Built with R version 4.2.2.
# library(car)
library(cowplot)
library(data.table)
library(DESeq2)
library(DHARMa)
library(ggpubr)
library(grid)
library(gridExtra)
library(iNEXT)
library(knitr)
library(lmPerm)
library(MASS)
library(pscl)
# library(rcompanion)
library(tidyverse)
library(vegan)
library(viridis)
# library(devtools)
# install_github("eastmallingresearch/Metabarcoding_pipeline/scripts")
library(metafuncs)
ALPHA = 0.1 # DESeq2 alpha value
OTUFILTER = 0.01 # Remove OTUs with proportion of total reads below value
READFILTER = 0.05 # Will remove samples with read sum below sample_median_reads*READFILTER
PAIREDONLY = F # Will remove the pair of samples which fail the readfilter - probably only useful for DESeq separated by type NOTE removes pairs before DESeq object is created
TAXCONF = 0.80 # Sets the taxonomy confidence level to get "rank" in taxonomy files
TOPOTU = 10 # Number of Top OTUs for summary information
DIFFOTU = 200 # Number of Top OTUs for correlation analysis
# graphics
DEVICE = "png"
DPI = 1200
WIDTH = 9
HEIGHT = 9
# Model design
DESIGN = y ~ Site + Season + Scion
FULL_DESIGN = y ~ Site * Season * Scion
canker_design = "Cankers ~ Site * Season * Scion"
# colour blind palette
cbPalette <- c(
"#000000", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)
source("functions/metabarcoding.R")
source("functions/loadme.R")
source("functions/rarefaction.R")
Bacterial and fungal ASV (ZOTU) tables, sample metadata, and taxonomy
files are loaded into named lists using the loadData
function from Greg’s metafuncs package.
Site names are encoded as follows according to previous work:
metadata <- "sample_metadata.txt"
# Load data
ubiome_BAC <- loadData("data/BAC.zotu_table.txt",metadata,"data/zBAC.sintax.taxa",RHB="BAC")
ubiome_FUN <- loadData("data/FUN.zotu_table.txt",metadata,"data/zFUN.sintax.taxa",RHB="FUN")
# Change sites Avalon -> 1, Scripps -> 2, and WWF -> 3.
# Correct for planting date month variability.
# March and April are replaced by spring, December by winter.
mutate_factors <- function(data){
data <- data %>%
rename(location = site, Scion = cultivar) %>%
mutate(
Site = case_when(
location == "Avalon" ~ 1,
location == "Scripps" ~ 2,
location == "WWF" ~ 3
) %>% as.factor(),
Season = case_when(
planting_date %in% c("march", "april") ~ "Spring",
planting_date %in% c("dec") ~ "Winter"
) %>% as.factor(),
Scion = as.factor(Scion)
)
return(data)
}
ubiome_BAC$colData <- mutate_factors(ubiome_BAC$colData)
ubiome_FUN$colData <- mutate_factors(ubiome_FUN$colData)
# Sample "A2-7" removed due to missampling.
ubiome_BAC$colData <- ubiome_BAC$colData[!rownames(ubiome_BAC$colData) %in% "HMA27", ]
ubiome_BAC$countData <- ubiome_BAC$countData[, !colnames(ubiome_BAC$countData) %in% "HMA27"]
ubiome_FUN$colData <- ubiome_FUN$colData[!rownames(ubiome_FUN$colData) %in% "HMA27", ]
ubiome_FUN$countData <- ubiome_FUN$countData[, !colnames(ubiome_FUN$countData) %in% "HMA27"]
Plantae taxa are filtered from fungal taxData.
Chloroplast and Eukaryote taxa are filtered from bacterial
taxData. Corresponding OTUs are removed from
countData.
# Filter Plant, Chloroplast, and Eukaryote OTUs
# Fungi: Plantae OTUs
cat("Fungi:", length(grep("Plantae", ubiome_FUN$taxData$kingdom)), "Plantae OTUs\n")
# Fungi: 0 Plantae OTUs
# Bacteria: Chloroplast (Streptophyta) and Eukaryote OTUs
cat(
"Bacteria:", length(grep("Streptophyta", ubiome_BAC$taxData$genus)), "Chloroplast OTUs;",
length(grep("Eukaryota", ubiome_BAC$taxData$kingdom)), "Eukaryote OTUs\n"
)
# Bacteria: 37 Chloroplast OTUs; 188 Eukaryote OTUs
# Filter Chloroplast and Eukaryote
filt <- rownames(
ubiome_BAC$taxData[
grepl("Streptophyta", ubiome_BAC$taxData$genus) &
as.numeric(ubiome_BAC$taxData$g_conf) >= TAXCONF,
]
)
filt <- c(filt, rownames(ubiome_BAC$taxData[grep("Eukaryota", ubiome_BAC$taxData$kingdom), ]))
cat("Bacteria: removing", length(filt), "OTUs")
# Bacteria: removing 198 OTUs
ubiome_BAC$taxData <- ubiome_BAC$taxData[!rownames(ubiome_BAC$taxData) %in% filt, ]
ubiome_BAC$countData <- ubiome_BAC$countData[!rownames(ubiome_BAC$countData) %in% filt, ]
Plot rarefaction curves.
Remove samples with read count below 5 % of median.
invisible(mapply(assign, names(ubiome_BAC), ubiome_BAC, MoreArgs = list(envir = globalenv())))
rare_bac <- gfunc(countData, colData, "Bacteria")
# rare_bac <- gfunc(as.data.frame(counts(dds)), as.data.frame(colData(dds)), "Bacteria ZOTU")
invisible(mapply(assign, names(ubiome_FUN), ubiome_FUN, MoreArgs = list(envir = globalenv())))
rare_fun <- gfunc(countData, colData, "Fungi")
# rare_fun <- gfunc(as.data.frame(counts(dds)), as.data.frame(colData(dds)), "Fungi ZOTU")
rarefaction_plots <- grid.arrange(
rare_bac, rare_fun,
left = textGrob(label = expression("log"[10] * " aligned sequences"), rot = 90),
bottom = "ASV count", nrow = 2
)
ggsave(filename = "rarefaction_plots.png", plot = rarefaction_plots, path = "figures/")
rarefaction_plots
# Fungi
med <- median(colSums(ubiome_FUN$countData))
filt <- !colSums(ubiome_FUN$countData) > med * READFILTER
cat("Fungi: ",sum(filt),"sample(s) removed\n")
# Bacteria
med <- median(colSums(ubiome_BAC$countData))
filt <- !colSums(ubiome_BAC$countData) > med * READFILTER
cat("Bacteria: ",sum(filt),"sample(s) removed\n")
Number of ASVs which account for 50 %, 80 %, and 99 % of total reads.
asv_propotions <- function(countData, proportion){
i <- sum(countData)
y <- rowSums(countData)
y <- y[order(y, decreasing = T)]
asvs <- length(y[(cumsum(y) / i <= proportion)])
return(asvs)
}
proportions <- c(0.5, 0.9, 0.99, 1)
top_asvs <- data.table(
"proportion" = proportions,
"Fungi" = lapply(proportions, function(x) asv_propotions(ubiome_FUN$countData, x)),
"Bacteria" = lapply(proportions, function(x) asv_propotions(ubiome_BAC$countData, x))
)
top_asvs
# proportion Fungi Bacteria
# 1: 0.50 10 169
# 2: 0.90 171 2186
# 3: 0.99 995 5883
# 4: 1.00 2401 7265
Remove ASVs with read count below 1 % of total reads.
# Fungi
keep <- filter_otus(ubiome_FUN$countData, OTUFILTER)
cat("Fungi: removing", nrow(ubiome_FUN$countData) - length(keep), "OTUs\n")
# Fungi: removing 1406 OTUs
ubiome_FUN$taxData <- ubiome_FUN$taxData[rownames(ubiome_FUN$taxData) %in% keep,]
ubiome_FUN$countData <- ubiome_FUN$countData[rownames(ubiome_FUN$countData) %in% keep,]
# Bacteria
keep <- filter_otus(ubiome_BAC$countData, OTUFILTER)
cat("Bacteria: removing", nrow(ubiome_BAC$countData) - length(keep), "OTUs")
# Bacteria: removing 1382 OTUs
ubiome_BAC$taxData <- ubiome_BAC$taxData[rownames(ubiome_BAC$taxData) %in% keep,]
ubiome_BAC$countData <- ubiome_BAC$countData[rownames(ubiome_BAC$countData) %in% keep,]
OTU normalisation is performed using qPCR theoretical copy number data. Copy number is calculated per mg of root sample from the qPCR data.
abundance <- fread("mean_abundance.csv")
# Add sample ID to abundance data
abundance$id <- paste0("HM", gsub("-", "", abundance$Sample))
# abundance$id <- abundance$Sample
abundance$copy_number <- abundance$MeanAdjustedTCN_mg
abundance$log_copy_number <- log10(abundance$copy_number)
# Add bacterial (16S) and fungal (ITS) abundance to ubiome BAC and FUN named lists
ubiome_FUN$abundance <- abundance[abundance$Target == "ITS"] %>%
column_to_rownames(var = "id")
ubiome_BAC$abundance <- abundance[abundance$Target == "16S"] %>%
column_to_rownames(var = "id")
# Merge copy number from abundance with colData
ubiome_FUN$colData <- merge(
ubiome_FUN$colData,
ubiome_FUN$abundance[, c("Target", "copy_number", "log_copy_number")],
by = 0
) %>% column_to_rownames(var = "Row.names")
ubiome_BAC$colData <- merge(
ubiome_BAC$colData,
ubiome_BAC$abundance[, c("Target", "copy_number", "log_copy_number")],
by = 0
) %>% column_to_rownames(var = "Row.names")
# Detect outliers with std > threshold from the median
detect_outliers <- function(x, val, threshold, na.rm = TRUE) {
med_x <- median(x[[val]], na.rm = na.rm)
sd_x <- sd(x[[val]], na.rm = na.rm)
outliers <- x[x[[val]] > (med_x + threshold * sd_x) | x[[val]] < (med_x - threshold * sd_x), ]
return(outliers)
}
outliers_FUN <- detect_outliers(ubiome_FUN$abundance, "MeanAdjustedTCN_mg", 3)
outliers_BAC <- detect_outliers(ubiome_BAC$abundance, "MeanAdjustedTCN_mg", 3)
# Remove samples with copy number > 3 std from the median
outliers <- rownames(outliers_FUN)
ubiome_FUN$abundance <- ubiome_FUN$abundance[!rownames(ubiome_FUN$abundance) %in% outliers, ]
ubiome_FUN$countData <- ubiome_FUN$countData[, !colnames(ubiome_FUN$countData) %in% outliers]
ubiome_FUN$colData <- ubiome_FUN$colData[!rownames(ubiome_FUN$colData) %in% outliers, ]
cat("Fungi: removing", length(outliers), "outlier(s)\n")
# Fungi: removing 1 outlier(s)
Sample A1-3 is removed from the fungal data due to abnormally high copy number.
Canker count data for sampled trees only.
# Canker count data for sampled trees only
canker_data <- fread("canker_data.csv", select = c(1:5, 7:34))
# Remove spaces from column names and convert to lowercase
colnames(canker_data) <- tolower(gsub(" ", "_", colnames(canker_data)))
# Codify site names, add planting season and total canker count for timepoint 4
canker_data <- mutate(
canker_data,
Site = case_when(
site == "Avalon" ~ 1,
site == "Scripps" ~ 2,
site == "WWF" ~ 3
) %>% as.factor(),
Season = case_when(
planting_date %in% c("March", "April") ~ "Spring",
planting_date %in% c("Dec") ~ "Winter"
),
Scion = as.factor(cultivar),
total_cankers = a4 + b4 + c4 + d4 + e4
)
# Identify samples with missing values
missing <- unique(canker_data[!complete.cases(canker_data), code])
# Also remove sample A2-7 due to missampling
missing <- c(missing, "HMA27")
# Remove missing samples from canker data
canker_data <- canker_data[!canker_data$code %in% missing, ]
# Verify that there are two trees for each sample
canker_data %>% group_by(code) %>% summarise(n = n()) %>% filter(n != 2)
# # A tibble: 0 × 2
# # … with 2 variables: code <chr>, n <int>
# Sum of total cankers for each pair of trees with matching code
cankers <- canker_data %>%
group_by(code) %>%
summarise(
Site = first(Site),
Season = first(Season),
Scion = first(Scion),
Cankers = sum(total_cankers)
) %>%
column_to_rownames("code")
# Add total canker count to colData for both FUN and BAC
ubiome_FUN$colData <- merge(
ubiome_FUN$colData,
cankers["Cankers"],
by = 0,
all.x = T
) %>% column_to_rownames("Row.names")
ubiome_BAC$colData <- merge(
ubiome_BAC$colData,
cankers["Cankers"],
by = 0,
all.x = T
) %>% column_to_rownames("Row.names")
Summary stats
# png("figures/hist.png", width = 800, height = 600)
# hist(cankers$Cankers, breaks = 20, main = "Total canker count", xlab = "Total canker count")
# dev.off()
cankers_hist <- ggdensity(
cankers, x = "Cankers", fill = "Site", facet.by = "Site", ncol = 1,
add = "mean", rug = T, palette = cbPalette,
title = "Total canker count", xlab = "Total canker count"
)
cankers_hist
ggsave(filename = "cankers_hist.png", plot = cankers_hist, path = "figures/")
cankers_box <- ggboxplot(
cankers, x = "Site", y = "Cankers", palette = cbPalette,
color = "Scion", add = "jitter", legend = "top",
title = "Total canker count", xlab = "Site", ylab = "Total canker count"
)
cankers_box
ggsave(filename = "cankers_box.png", plot = cankers_box, path = "figures/")
cankers_bar <- ggbarplot(
cankers, x = "Site", y = "Cankers", fill = "Scion",
palette = cbPalette, add = "mean_se", position = position_dodge(0.8),
title = "Total canker count", xlab = "Site", ylab = "Total canker count"
)
cankers_bar
ggsave(filename = "cankers_bar.png", plot = cankers_bar, path = "figures/")
GLM
# Effect of Site, Scion, and Season on canker count
# Formula
formula <- update(FULL_DESIGN, Cankers ~ .)
# formula <- Cankers ~ Site + Season + Scion + site:Season + site:Scion + Season:Scion
# Log-linear model
canker_lm <- lm(update(FULL_DESIGN, log(Cankers + 1) ~ .), data = cankers)
par(mfrow = c(2, 2))
plot(canker_lm)
# Residual checking
res <- resid(canker_lm, type = "pearson")
# Poisson model
canker_poisson <- glm(formula, data = cankers, family = "poisson")
poisson_plot <- plot(simulateResiduals(canker_poisson), title = "Poisson model")
# Model overdispersed
# Negative binomial model
canker_negbin <- glm.nb(formula, data = cankers)
sim <- simulateResiduals(canker_negbin)
plot(sim, title = "Negative binomial model")
# canker_model_plots <- ggarrange(lm_plot, poisson_plot, negbin_plot, ncol = 3)
# ggsave(filename = "canker_model_plots.png", plot = canker_model_plots, path = "figures/")
# png("figures/canker_residuals.png", width = 800, height = 600)
# plot(sim)
# dev.off()
testZeroInflation(sim)
#
# DHARMa zero-inflation test via comparison to expected zeros with simulation under H0 = fitted model
#
# data: simulationOutput
# ratioObsSim = 0.77399, p-value = 0.76
# alternative hypothesis: two.sided
nagelkerke(canker_negbin)
# Error in nagelkerke(canker_negbin): could not find function "nagelkerke"
# Model good fit
canker_anova <- anova(canker_negbin, test = "Chisq") %>% data.frame()
total_deviance <- sum(canker_anova$Deviance, na.rm = T) + tail(canker_anova$Resid..Dev, 1)
canker_anova$Perc..Dev <- canker_anova$Deviance / total_deviance * 100
canker_anova
# Df Deviance Resid..Df Resid..Dev Pr..Chi. Perc..Dev
# NULL NA NA 73 314.98196 NA NA
# Site 2 118.75744595 71 196.22452 1.629852e-26 37.702935303
# Season 1 0.02066259 70 196.20386 8.857019e-01 0.006559927
# Scion 6 24.29375368 64 171.91010 4.611042e-04 7.712744377
# Site:Season 2 28.86115598 62 143.04895 5.406044e-07 9.162796386
# Site:Scion 12 31.65405909 50 111.39489 1.564383e-03 10.049483066
# Season:Scion 6 7.40666778 44 103.98822 2.848693e-01 2.351457744
# Site:Season:Scion 11 26.62949240 33 77.35873 5.225415e-03 8.454291190
# Make sure countData and colData still match, if they do, create DESeq objects, if not throw error
if(identical(colnames(ubiome_FUN$countData), rownames(ubiome_FUN$colData))) {
# Create DESeq object
ubiome_FUN$dds <- ubiom_to_des(ubiome_FUN)
print("FUN DESeq object created")
} else {
stop("FUN countData and colData do not match")
}
# [1] "FUN DESeq object created"
if(identical(colnames(ubiome_BAC$countData), rownames(ubiome_BAC$colData))) {
# Create DESeq object
ubiome_BAC$dds <- ubiom_to_des(ubiome_BAC)
print("BAC DESeq object created")
} else {
stop("BAC countData and colData do not match")
}
# [1] "BAC DESeq object created"
Absolute abundance normalisation using DESeq2 size factors.
Values are centred around the mean of the copy number.
# Normalise count data using DESeq2 size factors
ubiome_FUN$dds$sizeFactor <- ubiome_FUN$dds$copy_number / mean(ubiome_FUN$dds$copy_number)
ubiome_BAC$dds$sizeFactor <- ubiome_BAC$dds$copy_number / mean(ubiome_BAC$dds$copy_number)
# Save environment
save.image("data_loaded.RData")
# Unpack fungi data
invisible(mapply(assign, names(ubiome_FUN), ubiome_FUN, MoreArgs = list(envir = globalenv())))
cat(
"Raw reads", "\n\n",
"Total raw reads:\t\t", sum(countData), "\n",
"Mean raw reads per sample:\t", mean(colSums(countData)), "\n",
"Median raw reads per sample:\t", median(colSums(countData)), "\n",
"Max raw reads per sample:\t", max(colSums(countData)), "\n",
"Min raw reads per sample:\t", min(colSums(countData)), "\n\n"
)
# Raw reads
#
# Total raw reads: 7293776
# Mean raw reads per sample: 90046.62
# Median raw reads per sample: 93435
# Max raw reads per sample: 113518
# Min raw reads per sample: 38472
#colSums(countData)
nct <- counts(dds, normalize = T)
cat("Normalised reads", "\n\n",
"Total normalised reads:\t\t", sum(nct), "\n",
"Mean normalised reads per sample:\t", mean(colSums(nct)), "\n",
"Median normalised reads per sample:\t", median(colSums(nct)), "\n",
"Min normalised reads per sample:\t", min(colSums(nct)), "\n",
"Max normalised reads per sample:\t", max(colSums(nct)), "\n\n"
)
# Normalised reads
#
# Total normalised reads: 12468857
# Mean normalised reads per sample: 153936.5
# Median normalised reads per sample: 98624.28
# Min normalised reads per sample: 28901.7
# Max normalised reads per sample: 881441.3
#round(colSums(counts(dds,normalize = T)),0)
cat(
"Total OTUs:\t\t", nrow(taxData),"\n\n",
"Raw reads per OTU summary", "\n\n",
"Mean raw reads per OTU:\t", mean(rowSums(countData)),"\n",
"Median raw per OTU:\t\t", median(rowSums(countData)),"\n",
"OTU raw Min reads:\t\t", min(rowSums(countData)),"\n",
"OTU raw Max reads:\t\t", max(rowSums(countData)),"\n\n"
)
# Total OTUs: 995
#
# Raw reads per OTU summary
#
# Mean raw reads per OTU: 7330.428
# Median raw per OTU: 588
# OTU raw Min reads: 115
# OTU raw Max reads: 714327
cat(
"Normalised reads per OTU summary","\n\n",
"Mean normalised reads per OTU:\t\t", mean(rowSums(nct)),"\n",
"Median normalised reads per OTU:\t", median(rowSums(nct)),"\n",
"OTU normalised Min reads:\t\t", min(rowSums(nct)),"\n",
"OTU normalised Max reads:\t\t", max(rowSums(nct)),"\n\n"
)
# Normalised reads per OTU summary
#
# Mean normalised reads per OTU: 12531.51
# Median normalised reads per OTU: 1025.725
# OTU normalised Min reads: 101.2814
# OTU normalised Max reads: 1509459
y <- rowSums(nct)
y <- y[order(y, decreasing = T)]
# proportion
xy <- y / sum(y)
cat("Top " ,TOPOTU, "OTUs:\n")
# Top 10 OTUs:
data.frame(counts = y[1:TOPOTU], proportion = xy[1:TOPOTU], rank = taxData[names(y)[1:TOPOTU],]$rank)
# counts proportion rank
# OTU2 1509458.8 0.12105832 Ascomycota(p)
# OTU1 1490469.5 0.11953538 Dactylonectria macrodidyma(s)
# OTU5 1068164.1 0.08566657 Leotiomycetes(c)
# OTU4 1059908.0 0.08500442 Ascomycota(p)
# OTU3 480660.1 0.03854885 Ilyonectria destructans(s)
# OTU7 290896.6 0.02332985 Fusarium(g)
# OTU6 227927.9 0.01827978 Ilyonectria robusta(s)
# OTU9 201690.6 0.01617555 Ascomycota(p)
# OTU8 191083.5 0.01532486 Fusarium(g)
# OTU11 131684.2 0.01056105 Truncatella angustata(s)
Proportion of OTUs which can be assigned (with the given confidence) at each taxonomic rank.
# Proportion of OTUs which can be assigned (with the given confidence) at each taxonomic rank
tx <- copy(taxData)
setDT(tx)
cols <- names(tx)[9:15]
tx[, (cols) := lapply(.SD, as.factor), .SDcols = cols]
data.table(
rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
"0.8" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.8) / nrow(tx))), 2),
"0.65" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.65) / nrow(tx))), 2),
"0.5" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.5) / nrow(tx))), 2)
)
# rank 0.8 0.65 0.5
# 1: kingdom 1.00 1.00 1.00
# 2: phylum 0.84 0.87 0.90
# 3: class 0.70 0.74 0.78
# 4: order 0.54 0.60 0.64
# 5: family 0.42 0.45 0.49
# 6: genus 0.38 0.42 0.47
# 7: species 0.24 0.30 0.35
% of reads which can be assigned to each taxonomic ranks
tx <-taxData[rownames(dds),]
nc <- counts(dds, normalize = T)
ac <- sum(nc)
data.table(
rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
"0.8" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.8),]) / ac * 100))), 2),
"0.65" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.65),]) / ac * 100))), 2),
"0.5" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.5),]) / ac * 100))), 2)
)
# rank 0.8 0.65 0.5
# 1: kingdom 100.00 100.00 100.00
# 2: phylum 84.14 96.59 96.83
# 3: class 60.12 70.92 71.47
# 4: order 53.49 58.87 68.76
# 5: family 44.97 46.80 50.25
# 6: genus 46.06 48.01 50.72
# 7: species 30.44 36.70 41.62
Plot copy number for each sample grouped by site, Scion, and Season. Test the effect of site, Scion, and Season on copy number using ANOVA.
# abundance_plot <- ggplot(
# data = as.data.frame(colData(dds)),
# aes(x = site, y = log_copy_number, colour = Scion, shape = Season)
# ) + geom_jitter() +
# scale_colour_manual(values = cbPalette)
fun_abundance_box <- ggboxplot(
data = as.data.frame(colData(dds)), x = "Site", y = "log_copy_number",
color = "Scion", add = "jitter", legend = "top",
title = "Fungal abundance", xlab = "Site", ylab = "log10 copy number"
)
ggsave(
filename = "fun_abundance.png", plot = fun_abundance_box, path = "figures/",
height = 20, width = 20, units = "cm"
)
fun_abundance_box
fun_abundance_bar <- ggbarplot(
data = as.data.frame(colData(dds)), x = "Season", y = "log_copy_number",
fill = "Site", add = "mean_se",
palette = cbPalette, position = position_dodge(0.8),
title = "(a) Fungal abundance", xlab = "Planting Season", ylab = "Mean copy number (log10)"
) + guides(fill = guide_legend(title = "Site"))
ggsave(
filename = "fun_abundance_bar.png", plot = fun_abundance_bar, path = "figures/",
height = 20, width = 20, units = "cm"
)
fun_abundance_bar
# Formula for ANOVA
formula <- update(FULL_DESIGN, log_copy_number ~ .)
abundance_anova <- aov(formula, data = as.data.frame(colData(dds)))
# Normality check
par(mfrow = c(2, 2))
plot(abundance_anova)
png("figures/fun_abundance_norm.png", width = 800, height = 600)
par(mfrow = c(2, 2))
plot(abundance_anova)
dev.off()
# png
# 2
# Results
summary(abundance_anova)
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 0.861 0.4306 4.663 0.0153 *
# Season 1 0.501 0.5012 5.427 0.0251 *
# Scion 6 0.477 0.0795 0.860 0.5324
# Site:Season 2 0.683 0.3415 3.698 0.0338 *
# Site:Scion 12 1.203 0.1003 1.086 0.3981
# Season:Scion 6 0.458 0.0763 0.827 0.5564
# Site:Season:Scion 12 0.918 0.0765 0.828 0.6214
# Residuals 39 3.602 0.0924
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abundance_results <- abundance_anova %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(abundance_results$Sum.Sq)
abundance_results$Perc.Var <- abundance_results$Sum.Sq / total_variance * 100
abundance_results
# Df Sum.Sq Mean.Sq F.value Pr..F. Perc.Var
# Site 2 0.8612429 0.43062144 4.6626824 0.01528779 9.895484
# Season 1 0.5012156 0.50121559 5.4270616 0.02509695 5.758852
# Scion 6 0.4767020 0.07945034 0.8602723 0.53239080 5.477198
# Site:Season 2 0.6830842 0.34154212 3.6981494 0.03383255 7.848482
# Site:Scion 12 1.2031665 0.10026388 1.0856371 0.39812911 13.824108
# Season:Scion 6 0.4580790 0.07634649 0.8266645 0.55642162 5.263223
# Site:Season:Scion 12 0.9180631 0.07650525 0.8283835 0.62137102 10.548335
# Residuals 39 3.6018401 0.09235487 NA NA 41.384319
# plot alpha diversity - plot_alpha will convert normalised abundances to integer values
fun_alpha_plot <- plot_alpha(
counts(dds, normalize = F), colData(dds),
design = "Scion", colour = "Site",
measures = c("Shannon", "Simpson"),
type = "bar"
) + scale_colour_manual(values = cbPalette) +
theme(axis.title.x = element_blank()) +
ggtitle("Fungal α-diversity")
ggsave(
filename = "fun_alpha.png", plot = fun_alpha_plot, path = "figures/",
height = 20, width = 40, units = "cm"
)
fun_alpha_plot
# get the diversity index data
all_alpha_ord <- plot_alpha(
counts(dds, normalize = F),
colData(dds),
returnData = T
)
# join diversity indices and metadata
all_alpha_ord <- all_alpha_ord[
as.data.table(colData(dds), keep.rownames = "Samples"),
on = "Samples"
]
fun_alpha <- all_alpha_ord
formula <- FULL_DESIGN # x ~ Site * Season * Scion + Site / Site.block
setkey(all_alpha_ord, S.chao1)
all_alpha_ord[, measure := as.numeric(as.factor(S.chao1))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings: sequential SS "
summary(result)
# Component 1 :
# Df R Sum Sq R Mean Sq Iter Pr(Prob)
# Site 2 11554.5 5777.2 5000 < 2e-16 ***
# Season 1 2056.4 2056.4 1079 0.08526 .
# Site:Season 2 812.4 406.2 270 0.47778
# Scion 6 875.7 145.9 315 0.93968
# Site:Scion 12 2817.7 234.8 762 0.93832
# Season:Scion 6 2046.5 341.1 206 0.88350
# Site:Season:Scion 12 2735.3 227.9 431 0.94896
# Residuals 39 21381.5 548.2
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
# Df R.Sum.Sq R.Mean.Sq Iter Pr.Prob. Perc.Var
# Site 2 11554.4684 5777.2342 5000 0.00000000 26.094102
# Season 1 2056.4245 2056.4245 1079 0.08526413 4.644139
# Site:Season 2 812.3544 406.1772 270 0.47777778 1.834585
# Scion 6 875.6763 145.9461 315 0.93968254 1.977589
# Site:Scion 12 2817.7201 234.8100 762 0.93832021 6.363415
# Season:Scion 6 2046.5389 341.0898 206 0.88349515 4.621813
# Site:Season:Scion 12 2735.3173 227.9431 431 0.94895592 6.177320
# Residuals 39 21381.5000 548.2436 NA NA 48.287037
setkey(all_alpha_ord, shannon)
all_alpha_ord[, measure := as.numeric(as.factor(shannon))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings: sequential SS "
summary(result)
# Component 1 :
# Df R Sum Sq R Mean Sq Iter Pr(Prob)
# Site 2 12291.8 6145.9 5000 < 2e-16 ***
# Season 1 1077.6 1077.6 1219 0.07629 .
# Site:Season 2 2320.4 1160.2 2067 0.04644 *
# Scion 6 570.9 95.1 188 1.00000
# Site:Scion 12 3082.1 256.8 411 0.87348
# Season:Scion 6 2730.6 455.1 2262 0.42485
# Site:Season:Scion 12 5311.1 442.6 1751 0.54426
# Residuals 39 16895.5 433.2
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
# Df R.Sum.Sq R.Mean.Sq Iter Pr.Prob. Perc.Var
# Site 2 12291.7504 6145.87520 5000 0.00000000 27.759147
# Season 1 1077.5749 1077.57489 1219 0.07629204 2.433548
# Site:Season 2 2320.4220 1160.21098 2067 0.04644412 5.240339
# Scion 6 570.8675 95.14458 188 1.00000000 1.289222
# Site:Scion 12 3082.1462 256.84552 411 0.87347932 6.960583
# Season:Scion 6 2730.6068 455.10114 2262 0.42484527 6.166682
# Site:Season:Scion 12 5311.1323 442.59435 1751 0.54426042 11.994427
# Residuals 39 16895.5000 433.21795 NA NA 38.156052
setkey(all_alpha_ord, simpson)
all_alpha_ord[, measure := as.numeric(as.factor(simpson))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings: sequential SS "
summary(result)
# Component 1 :
# Df R Sum Sq R Mean Sq Iter Pr(Prob)
# Site 2 12937.5 6468.8 5000 < 2e-16 ***
# Season 1 764.2 764.2 392 0.20408
# Site:Season 2 2484.2 1242.1 2641 0.05604 .
# Scion 6 1188.1 198.0 1059 0.87158
# Site:Scion 12 2027.8 169.0 551 0.97641
# Season:Scion 6 2529.6 421.6 1215 0.37531
# Site:Season:Scion 12 5334.6 444.6 2035 0.46781
# Residuals 39 17014.0 436.3
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
# Df R.Sum.Sq R.Mean.Sq Iter Pr.Prob. Perc.Var
# Site 2 12937.5044 6468.7522 5000 0.00000000 29.217490
# Season 1 764.2007 764.2007 392 0.20408163 1.725837
# Site:Season 2 2484.2454 1242.1227 2641 0.05603938 5.610310
# Scion 6 1188.0542 198.0090 1059 0.87157696 2.683049
# Site:Scion 12 2027.7819 168.9818 551 0.97640653 4.579453
# Season:Scion 6 2529.5940 421.5990 1215 0.37530864 5.712724
# Site:Season:Scion 12 5334.6194 444.5516 2035 0.46781327 12.047469
# Residuals 39 17014.0000 436.2564 NA NA 38.423668
# Number of PCs to include
n_pcs <- 20
# Perform PC decomposition of DES object
mypca <- des_to_pca(dds)
# To get pca plot axis into the same scale create a dataframe of PC scores multiplied by their variance
fun_pca <- t(data.frame(t(mypca$x) * mypca$percentVar))
formula = FULL_DESIGN
# Cumulative percentage of variance explained
pca_cum_var <- data.frame(
cumulative = cumsum(mypca$percentVar * 100),
no = 1:length(mypca$percentVar)
)
# Plot cumulative percentage of variance explained
fun_cum_pca <- ggline(
pca_cum_var, x = "no", y = "cumulative", plot_type = "l",
xlab = "Number of PCs", ylab = "Cumulative % variance explained",
title = "Fungi: cumulative % variance explained by PCs"
)
ggsave(filename = "fun_cum_pca.png", plot = fun_cum_pca, path = "figures/",)
fun_cum_pca
pca_var <- data.frame(
row.names = paste0("PC", 1:n_pcs),
perc_var = round(mypca$percentVar[1:n_pcs] * 100, 1)
)
pca_var
# perc_var
# PC1 27.1
# PC2 21.2
# PC3 8.4
# PC4 4.2
# PC5 3.3
# PC6 2.6
# PC7 1.9
# PC8 1.8
# PC9 1.7
# PC10 1.6
# PC11 1.3
# PC12 1.3
# PC13 1.2
# PC14 1.0
# PC15 1.0
# PC16 0.9
# PC17 0.9
# PC18 0.9
# PC19 0.8
# PC20 0.8
pca_summary <- apply(
mypca$x[, 1:n_pcs], 2,
function(x){
summary(aov(update(formula, x ~ .), data = as.data.frame(cbind(x, colData(dds)))))
}
)
pca_summary
# $PC1
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 71073 35537 238.166 < 2e-16 ***
# Season 1 31 31 0.208 0.65102
# Scion 6 871 145 0.973 0.45635
# Site:Season 2 2232 1116 7.480 0.00178 **
# Site:Scion 12 1299 108 0.726 0.71791
# Season:Scion 6 696 116 0.777 0.59290
# Site:Season:Scion 12 1781 148 0.994 0.47159
# Residuals 39 5819 149
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC2
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 41657 20829 62.213 7.35e-13 ***
# Season 1 411 411 1.228 0.27459
# Scion 6 125 21 0.062 0.99892
# Site:Season 2 5403 2701 8.069 0.00117 **
# Site:Scion 12 2269 189 0.565 0.85649
# Season:Scion 6 416 69 0.207 0.97246
# Site:Season:Scion 12 2080 173 0.518 0.89032
# Residuals 39 13057 335
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC3
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 2091 1045.4 3.075 0.0575 .
# Season 1 250 250.4 0.737 0.3960
# Scion 6 976 162.7 0.479 0.8202
# Site:Season 2 1598 798.8 2.350 0.1087
# Site:Scion 12 3182 265.2 0.780 0.6668
# Season:Scion 6 1399 233.2 0.686 0.6619
# Site:Season:Scion 12 3176 264.7 0.779 0.6682
# Residuals 39 13257 339.9
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC4
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 1692 846.2 5.486 0.00795 **
# Season 1 156 155.8 1.010 0.32103
# Scion 6 166 27.7 0.179 0.98087
# Site:Season 2 1216 608.2 3.943 0.02757 *
# Site:Scion 12 1399 116.6 0.756 0.68963
# Season:Scion 6 637 106.1 0.688 0.66035
# Site:Season:Scion 12 1578 131.5 0.853 0.59855
# Residuals 39 6016 154.2
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC5
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 238 119.1 0.739 0.484
# Season 1 457 456.7 2.835 0.100
# Scion 6 326 54.4 0.338 0.913
# Site:Season 2 313 156.3 0.970 0.388
# Site:Scion 12 1292 107.7 0.669 0.770
# Season:Scion 6 534 88.9 0.552 0.765
# Site:Season:Scion 12 680 56.7 0.352 0.973
# Residuals 39 6281 161.1
#
# $PC6
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 141 70.29 0.602 0.553
# Season 1 279 278.51 2.384 0.131
# Scion 6 225 37.51 0.321 0.922
# Site:Season 2 262 130.85 1.120 0.337
# Site:Scion 12 428 35.65 0.305 0.985
# Season:Scion 6 582 96.92 0.830 0.554
# Site:Season:Scion 12 1540 128.37 1.099 0.388
# Residuals 39 4557 116.85
#
# $PC7
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 44.9 22.47 0.283 0.755
# Season 1 16.0 16.02 0.202 0.655
# Scion 6 689.5 114.92 1.450 0.221
# Site:Season 2 13.3 6.67 0.084 0.919
# Site:Scion 12 1060.5 88.37 1.115 0.376
# Season:Scion 6 265.0 44.16 0.557 0.761
# Site:Season:Scion 12 838.4 69.87 0.881 0.572
# Residuals 39 3091.5 79.27
#
# $PC8
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 1.1 0.6 0.008 0.9919
# Season 1 425.5 425.5 6.057 0.0184 *
# Scion 6 147.8 24.6 0.351 0.9051
# Site:Season 2 769.7 384.9 5.478 0.0080 **
# Site:Scion 12 613.2 51.1 0.727 0.7163
# Season:Scion 6 594.6 99.1 1.411 0.2352
# Site:Season:Scion 12 401.4 33.5 0.476 0.9166
# Residuals 39 2739.9 70.3
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC9
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 74.9 37.4 0.648 0.5287
# Season 1 210.8 210.8 3.647 0.0635 .
# Scion 6 246.7 41.1 0.711 0.6425
# Site:Season 2 1500.6 750.3 12.982 4.77e-05 ***
# Site:Scion 12 418.6 34.9 0.604 0.8258
# Season:Scion 6 238.5 39.8 0.688 0.6606
# Site:Season:Scion 12 206.8 17.2 0.298 0.9860
# Residuals 39 2254.0 57.8
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC10
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 17.2 8.62 0.107 0.898
# Season 1 13.0 12.98 0.162 0.690
# Scion 6 279.3 46.55 0.580 0.744
# Site:Season 2 112.7 56.35 0.702 0.502
# Site:Scion 12 544.6 45.38 0.566 0.856
# Season:Scion 6 80.8 13.46 0.168 0.984
# Site:Season:Scion 12 646.4 53.87 0.671 0.767
# Residuals 39 3129.1 80.23
#
# $PC11
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 13.2 6.58 0.090 0.915
# Season 1 7.3 7.30 0.099 0.754
# Scion 6 170.3 28.38 0.386 0.883
# Site:Season 2 54.8 27.42 0.373 0.691
# Site:Scion 12 328.5 27.38 0.373 0.966
# Season:Scion 6 344.3 57.38 0.781 0.590
# Site:Season:Scion 12 372.9 31.08 0.423 0.945
# Residuals 39 2865.9 73.48
#
# $PC12
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 55.1 27.54 0.547 0.583
# Season 1 105.0 105.05 2.086 0.157
# Scion 6 196.0 32.66 0.648 0.691
# Site:Season 2 144.5 72.25 1.434 0.251
# Site:Scion 12 722.8 60.24 1.196 0.320
# Season:Scion 6 378.6 63.11 1.253 0.301
# Site:Season:Scion 12 560.2 46.68 0.927 0.531
# Residuals 39 1964.5 50.37
#
# $PC13
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 18.7 9.37 0.150 0.861
# Season 1 32.2 32.21 0.517 0.476
# Scion 6 76.6 12.76 0.205 0.973
# Site:Season 2 92.4 46.18 0.742 0.483
# Site:Scion 12 127.3 10.61 0.170 0.999
# Season:Scion 6 213.6 35.60 0.572 0.750
# Site:Season:Scion 12 639.9 53.32 0.857 0.595
# Residuals 39 2427.9 62.25
#
# $PC14
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 7.9 3.93 0.109 0.8970
# Season 1 6.4 6.45 0.179 0.6749
# Scion 6 168.8 28.14 0.780 0.5909
# Site:Season 2 114.4 57.19 1.585 0.2179
# Site:Scion 12 346.5 28.88 0.800 0.6480
# Season:Scion 6 509.5 84.92 2.353 0.0491 *
# Site:Season:Scion 12 625.9 52.15 1.445 0.1875
# Residuals 39 1407.4 36.09
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC15
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 4.0 2.02 0.055 0.946
# Season 1 13.7 13.67 0.375 0.544
# Scion 6 131.7 21.94 0.601 0.728
# Site:Season 2 15.1 7.55 0.207 0.814
# Site:Scion 12 560.8 46.74 1.280 0.269
# Season:Scion 6 322.9 53.81 1.474 0.212
# Site:Season:Scion 12 503.5 41.96 1.149 0.352
# Residuals 39 1423.7 36.51
#
# $PC16
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 1.2 0.62 0.017 0.983
# Season 1 36.3 36.33 1.007 0.322
# Scion 6 152.9 25.49 0.707 0.646
# Site:Season 2 143.2 71.62 1.986 0.151
# Site:Scion 12 535.6 44.64 1.238 0.294
# Season:Scion 6 349.3 58.22 1.615 0.169
# Site:Season:Scion 12 189.8 15.82 0.439 0.937
# Residuals 39 1406.4 36.06
#
# $PC17
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 4.5 2.26 0.058 0.944
# Season 1 20.5 20.49 0.527 0.472
# Scion 6 218.1 36.35 0.935 0.481
# Site:Season 2 99.3 49.66 1.277 0.290
# Site:Scion 12 351.3 29.27 0.753 0.692
# Season:Scion 6 163.4 27.23 0.700 0.651
# Site:Season:Scion 12 344.9 28.75 0.739 0.705
# Residuals 39 1516.2 38.88
#
# $PC18
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 33.5 16.77 0.532 0.592
# Season 1 6.3 6.27 0.199 0.658
# Scion 6 305.1 50.85 1.613 0.170
# Site:Season 2 40.2 20.08 0.637 0.534
# Site:Scion 12 540.3 45.02 1.428 0.195
# Season:Scion 6 299.6 49.94 1.584 0.178
# Site:Season:Scion 12 225.2 18.77 0.595 0.833
# Residuals 39 1229.8 31.53
#
# $PC19
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 5.4 2.68 0.081 0.922
# Season 1 1.4 1.44 0.043 0.836
# Scion 6 246.4 41.06 1.244 0.306
# Site:Season 2 12.7 6.35 0.192 0.826
# Site:Scion 12 316.6 26.39 0.799 0.649
# Season:Scion 6 202.4 33.73 1.022 0.426
# Site:Season:Scion 12 372.3 31.02 0.940 0.519
# Residuals 39 1287.3 33.01
#
# $PC20
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 24.8 12.39 0.356 0.703
# Season 1 23.8 23.81 0.683 0.414
# Scion 6 154.0 25.66 0.736 0.624
# Site:Season 2 17.1 8.56 0.246 0.783
# Site:Scion 12 180.6 15.05 0.432 0.941
# Season:Scion 6 146.0 24.33 0.698 0.653
# Site:Season:Scion 12 501.6 41.80 1.199 0.318
# Residuals 39 1359.6 34.86
pcas <- lapply(pca_summary, function(i) data.frame(unclass(i)))
pca_factors <- data.table(
PCs = names(pcas),
"Total_%" = round(pca_var$perc_var, 2),
"Site_%" = round(sapply(pcas, function(x) (x['Site ', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Site_*p*" = round(sapply(pcas, function(x) x['Site ', 'Pr..F.']), 4),
"Scion_%" = round(sapply(pcas, function(x) (x['Scion', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Scion_*p*" = round(sapply(pcas, function(x) x['Scion', 'Pr..F.']), 4),
"Season_%" = round(sapply(pcas, function(x) (x['Season ', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Season_*p*" = round(sapply(pcas, function(x) x['Season ', 'Pr..F.']), 4),
"Site:Scion_%" = round(sapply(pcas, function(x) (x['Site:Scion', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Site:Scion_*p*" = round(sapply(pcas, function(x) x['Site:Scion', 'Pr..F.']), 4),
"Site:Season_%" = round(sapply(pcas, function(x) (x['Site:Season ', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Site:Season_*p*" = round(sapply(pcas, function(x) x['Site:Season ', 'Pr..F.']), 4),
"Season:Scion_%" = round(sapply(pcas, function(x) (x['Season:Scion', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Season:Scion_*p*" = round(sapply(pcas, function(x) x['Season:Scion', 'Pr..F.']), 4),
"Residuals_%" = round(sapply(pcas, function(x) (x['Residuals', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2)
)
kable(pca_factors)
| PCs | Total_% | Site_% | Site_p | Scion_% | Scion_p | Season_% | Season_p | Site:Scion_% | Site:Scion_p | Site:Season_% | Site:Season_p | Season:Scion_% | Season:Scion_p | Residuals_% |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| PC1 | 27.1 | 84.81 | 0.0000 | 1.04 | 0.4564 | 0.04 | 0.6510 | 1.55 | 0.7179 | 2.66 | 0.0018 | 0.83 | 0.5929 | 6.94 |
| PC2 | 21.2 | 63.68 | 0.0000 | 0.19 | 0.9989 | 0.63 | 0.2746 | 3.47 | 0.8565 | 8.26 | 0.0012 | 0.64 | 0.9725 | 19.96 |
| PC3 | 8.4 | 8.06 | 0.0575 | 3.76 | 0.8202 | 0.97 | 0.3960 | 12.27 | 0.6668 | 6.16 | 0.1087 | 5.40 | 0.6619 | 51.13 |
| PC4 | 4.2 | 13.16 | 0.0080 | 1.29 | 0.9809 | 1.21 | 0.3210 | 10.88 | 0.6896 | 9.46 | 0.0276 | 4.95 | 0.6603 | 46.78 |
| PC5 | 3.3 | 2.35 | 0.4840 | 3.23 | 0.9127 | 4.51 | 0.1002 | 12.77 | 0.7699 | 3.09 | 0.3879 | 5.27 | 0.7653 | 62.06 |
| PC6 | 2.6 | 1.75 | 0.5530 | 2.81 | 0.9220 | 3.48 | 0.1307 | 5.34 | 0.9846 | 3.27 | 0.3366 | 7.26 | 0.5544 | 56.87 |
| PC7 | 1.9 | 0.75 | 0.7548 | 11.46 | 0.2209 | 0.27 | 0.6555 | 17.62 | 0.3763 | 0.22 | 0.9195 | 4.40 | 0.7615 | 51.36 |
| PC8 | 1.8 | 0.02 | 0.9919 | 2.60 | 0.9051 | 7.47 | 0.0184 | 10.77 | 0.7163 | 13.52 | 0.0080 | 10.44 | 0.2352 | 48.12 |
| PC9 | 1.7 | 1.45 | 0.5287 | 4.79 | 0.6425 | 4.09 | 0.0635 | 8.13 | 0.8258 | 29.13 | 0.0000 | 4.63 | 0.6606 | 43.76 |
| PC10 | 1.6 | 0.36 | 0.8984 | 5.79 | 0.7438 | 0.27 | 0.6898 | 11.29 | 0.8558 | 2.34 | 0.5016 | 1.67 | 0.9839 | 64.88 |
| PC11 | 1.3 | 0.32 | 0.9145 | 4.10 | 0.8833 | 0.18 | 0.7543 | 7.90 | 0.9656 | 1.32 | 0.6910 | 8.28 | 0.5900 | 68.94 |
| PC12 | 1.3 | 1.33 | 0.5832 | 4.75 | 0.6910 | 2.55 | 0.1567 | 17.52 | 0.3200 | 3.50 | 0.2506 | 9.18 | 0.3014 | 47.60 |
| PC13 | 1.2 | 0.52 | 0.8608 | 2.11 | 0.9732 | 0.89 | 0.4763 | 3.51 | 0.9989 | 2.55 | 0.4828 | 5.89 | 0.7502 | 66.91 |
| PC14 | 1.0 | 0.25 | 0.8970 | 5.30 | 0.5909 | 0.20 | 0.6749 | 10.87 | 0.6480 | 3.59 | 0.2179 | 15.99 | 0.0491 | 44.16 |
| PC15 | 1.0 | 0.14 | 0.9463 | 4.43 | 0.7277 | 0.46 | 0.5441 | 18.85 | 0.2686 | 0.51 | 0.8141 | 10.85 | 0.2125 | 47.85 |
| PC16 | 0.9 | 0.04 | 0.9829 | 5.43 | 0.6460 | 1.29 | 0.3217 | 19.03 | 0.2936 | 5.09 | 0.1509 | 12.41 | 0.1691 | 49.96 |
| PC17 | 0.9 | 0.17 | 0.9436 | 8.02 | 0.4812 | 0.75 | 0.4722 | 12.92 | 0.6924 | 3.65 | 0.2902 | 6.01 | 0.6508 | 55.78 |
| PC18 | 0.9 | 1.25 | 0.5917 | 11.38 | 0.1696 | 0.23 | 0.6581 | 20.16 | 0.1949 | 1.50 | 0.5344 | 11.18 | 0.1778 | 45.89 |
| PC19 | 0.8 | 0.22 | 0.9221 | 10.08 | 0.3055 | 0.06 | 0.8359 | 12.95 | 0.6487 | 0.52 | 0.8257 | 8.28 | 0.4257 | 52.66 |
| PC20 | 0.8 | 1.03 | 0.7030 | 6.40 | 0.6237 | 0.99 | 0.4136 | 7.50 | 0.9406 | 0.71 | 0.7834 | 6.06 | 0.6529 | 56.47 |
fun_pca_plot <- plotOrd(
fun_pca,
colData(dds),
design = "Site",
shapes = "Season",
axes = c(1, 2),
cbPalette = T,
alpha = 0.75,
) # + facet_wrap(~facet)
# geom_line(aes(group=facet),alpha=0.25,linetype=3,colour="#000000") +
# theme(text = element_text(size=14))
ggsave(filename = "fun_pca_plot.png", plot = fun_pca_plot, path = "figures/")
fun_pca_plot
sum_squares <- apply(mypca$x, 2 ,function(x)
summary(aov(update(formula, x ~ .), data = cbind(x, colData(dds))))[[1]][2]
)
sum_squares <- do.call(cbind, sum_squares)
x <- t(apply(sum_squares, 2, prop.table))
perVar <- x * mypca$percentVar
#colSums(perVar)
round(colSums(perVar) / sum(colSums(perVar)) * 100, 3)
# Site Season Scion Site:Season Site:Scion Season:Scion Site:Season:Scion Residuals
# 38.022 1.000 3.083 4.878 7.803 4.057 8.281 32.876
# Calculate Bray-Curtis distance matrix
vg <- vegdist(t(counts(dds, normalize = T)), method = "bray")
formula <- update(FULL_DESIGN, vg ~ .)
set.seed(sum(utf8ToInt("Hamish McLean")))
result <- adonis2(formula, colData(dds), permutations = 1000)
result
# Permutation test for adonis under reduced model
# Terms added sequentially (first to last)
# Permutation: free
# Number of permutations: 1000
#
# adonis2(formula = formula, data = colData(dds), permutations = 1000)
# Df SumOfSqs R2 F Pr(>F)
# Site 2 6.5423 0.31527 18.6694 0.000999 ***
# Season 1 0.4053 0.01953 2.3133 0.014985 *
# Scion 6 0.9096 0.04383 0.8652 0.769231
# Site:Season 2 0.7206 0.03472 2.0562 0.005994 **
# Site:Scion 12 2.1487 0.10354 1.0219 0.424575
# Season:Scion 6 1.1647 0.05613 1.1079 0.278721
# Site:Season:Scion 12 2.0269 0.09768 0.9640 0.580420
# Residual 39 6.8334 0.32930
# Total 80 20.7514 1.00000
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% data.frame()
df$Perc.Var <- df$SumOfSqs / df["Total", "SumOfSqs"] * 100
df
# Df SumOfSqs R2 F Pr..F. Perc.Var
# Site 2 6.5423049 0.31527021 18.6694186 0.000999001 31.527021
# Season 1 0.4053241 0.01953235 2.3133026 0.014985015 1.953235
# Scion 6 0.9095936 0.04383283 0.8652192 0.769230769 4.383283
# Site:Season 2 0.7205681 0.03472380 2.0562460 0.005994006 3.472380
# Site:Scion 12 2.1486813 0.10354382 1.0219291 0.424575425 10.354382
# Season:Scion 6 1.1646804 0.05612533 1.1078616 0.278721279 5.612533
# Site:Season:Scion 12 2.0269039 0.09767543 0.9640108 0.580419580 9.767543
# Residual 39 6.8333647 0.32929623 NA NA 32.929623
# Total 80 20.7514210 1.00000000 NA NA 100.000000
set.seed(sum(utf8ToInt("Hamish McLean")))
ord <- metaMDS(vg,trace=0)
#sratmax=20000,maxit=20000,try = 177, trymax = 177
fun_nmds <- scores(ord)
fun_nmds_plot <- plotOrd(
fun_nmds, colData(dds),
design = "Site",
shape = "Season",
alpha = 0.75, cbPalette = T
) #+ theme(text = element_text(size = 14))
ggsave(filename = "fun_nmds_plot.png", plot = fun_nmds_plot, path = "figures/")
fun_nmds_plot
# Extract normalised counts from DESeq object
asv_counts <- counts(dds, normalize = T) %>% as.data.frame()
# Sum ASV counts across samples
total_asv_counts <- rowSums(asv_counts)
# Sort ASVs by abundance
total_asv_counts <- total_asv_counts[order(total_asv_counts, decreasing = T)]
# Caculate cumulative percentage
cumulative <- data.frame(
cumulative = cumsum(total_asv_counts) / sum(total_asv_counts) * 100,
no = seq_along(total_asv_counts)
)
# Plot cumulative percentage of ASVs
fun_cum_asv <- ggline(
data = cumulative, x = "no", y = "cumulative",
plot_type = "l", palette = cbPalette,
title = "Cumulative percentage of fungal ASVs", xlab = "Number of ASVs",
ylab = "Cumulative percentage of reads"
)
ggsave(filename = "fun_cum_asv.png", plot = fun_cum_asv, path = "figures/")
fun_cum_asv
# Find the number of ASVs that account for 50%, 80%, and 99% of total reads
cat(
"Number of ASVs that account for 50%, 80%, and 99% of total reads", "\n\n",
"50%:", sum(cumulative <= 50), "\n",
"80%:", sum(cumulative <= 80), "\n",
"99%:", sum(cumulative <= 99), "\n"
)
# Number of ASVs that account for 50%, 80%, and 99% of total reads
#
# 50%: 57
# 80%: 140
# 99%: 741
# Find the cumulative percentage accounted for by top x ASVs
cat(
"Percentage of total reads accounted for by the top 100, 200,and 500 ASVs:", "\n\n",
"100:", round(cumulative[cumulative$no == 100, "cumulative"], 1) , "\n",
"200:", round(cumulative[cumulative$no == 200, "cumulative"], 1) , "\n",
"500:", round(cumulative[cumulative$no == 500, "cumulative"], 1) , "\n"
)
# Percentage of total reads accounted for by the top 100, 200,and 500 ASVs:
#
# 100: 85.9
# 200: 92.6
# 500: 98.1
# Average ASV counts in order
mean_asv_counts <- rowMeans(asv_counts)
mean_asv_counts <- mean_asv_counts[order(mean_asv_counts, decreasing = T)]
# Plot read count distribution
fun_asv_counts <- ggline(
data = data.frame(ASV = seq_along(mean_asv_counts), counts = mean_asv_counts),
x = "ASV", y = "counts", plot_type = "l",
title = "Fungal ASV read count distribution", xlab = "ASV", ylab = "Mean read count"
)
ggsave(filename = "fun_asv_counts.png", plot = fun_asv_counts, path = "figures/")
fun_asv_counts
# Number of ASVs with mean read count > 100, 200, and 500
cat(
"Number of ASVs with mean read count > 100, 200, and 500", "\n\n",
"100:", sum(rowMeans(asv_counts) > 100), "\n",
"200:", sum(rowMeans(asv_counts) > 200), "\n",
"500:", sum(rowMeans(asv_counts) > 500), "\n"
)
# Number of ASVs with mean read count > 100, 200, and 500
#
# 100: 147
# 200: 86
# 500: 49
# Filter the top x abundant OTUs by the sum of their normalised counts
# top_asvs <- asv_counts[order(rowSums(asv_counts), decreasing = T)[1:DIFFOTU], ]
# Filter ASVs with mean read count > 100
top_asvs <- asv_counts[rowMeans(asv_counts) > 100, ]
# Check that sample names match
identical(names(top_asvs), rownames(colData))
# [1] TRUE
# Extract taxonomic data for top OTUs
top_taxa <- taxData[rownames(top_asvs), ]
# Log transform normalised counts
# top_asvs <- log10(top_asvs + 1)
top_asv_data <- data.frame(t(top_asvs))
top_asv_ids <- rownames(top_asvs)
# Check that sample names match
identical(rownames(top_asv_data), rownames(colData))
# [1] TRUE
# Add sample metadata to top OTU data
top_asv_data <- merge(top_asv_data, colData, by = 0) %>% column_to_rownames("Row.names")
Effect of Site, Scion, and planting season on abundance of top 200 ASVs
# ANOVA of top ASVs
otu_lm_anova <- function(otu, formula, data) {
f = update(formula, paste0("log(", otu, " + 1) ~ ."))
a = aov(f, data = data) %>% summary() %>% unclass() %>% data.frame()
total_var = sum(a$Sum.Sq)
d = data.frame(
OTU = otu,
Taxonomy = taxData[otu, "rank"],
site_var = a['Site ', 'Sum.Sq'] / total_var * 100,
site_p = a['Site ', 'Pr..F.'],
scion_var = a['Scion', 'Sum.Sq'] / total_var * 100,
scion_p = a['Scion', 'Pr..F.'],
season_var = a['Season ', 'Sum.Sq'] / total_var * 100,
season_p = a['Season ', 'Pr..F.'],
site_scion_var = a['Site:Scion', 'Sum.Sq'] / total_var * 100,
site_scion_p = a['Site:Scion', 'Pr..F.'],
site_season_var = a['Site:Season ', 'Sum.Sq'] / total_var * 100,
site_season_p = a['Site:Season ', 'Pr..F.'],
season_scion_var = a['Season:Scion', 'Sum.Sq'] / total_var * 100,
season_scion_p = a['Season:Scion', 'Pr..F.'],
residuals_var = a['Residuals', 'Sum.Sq'] / total_var * 100
)
return(d)
}
# Negative binomial regression model
otu_negbin_anova <- function(otu, formula, data) {
f = update(formula, paste0(otu, " ~ ."))
m = glm.nb(f, data = data)
a = anova(m, test = "Chisq") %>% data.frame()
total_deviance = sum(a$Deviance, na.rm = T) + tail(a$Resid..Dev, 1)
d = data.frame(
OTU = otu,
Taxonomy = taxData[otu, "rank"],
site_var = a['Site', 'Deviance'] / total_deviance * 100,
site_p = a['Site', 'Pr..Chi.'],
cultivar_var = a['Scion', 'Deviance'] / total_deviance * 100,
cultivar_p = a['Scion', 'Pr..Chi.'],
season_var = a['Season', 'Deviance'] / total_deviance * 100,
season_p = a['Season', 'Pr..Chi.']
)
return(d)
}
formula <- FULL_DESIGN
# Full design model does not converge
# formula <- y ~ site + Scion + Season + site:Scion + site:Season
# formula <- DESIGN
# otu_lm_anova(top_asv_ids[1], formula, top_asv_data)
otu_anova_results <- sapply(
top_asv_ids, function(x) otu_lm_anova(x, formula, top_asv_data)
) %>% t() %>% data.table()
otu_anova_results_adjusted <- otu_anova_results %>%
mutate(
site_p = p.adjust(site_p, method = "BH"),
scion_p = p.adjust(scion_p, method = "BH"),
season_p = p.adjust(season_p, method = "BH"),
site_scion_p = p.adjust(site_scion_p, method = "BH"),
site_season_p = p.adjust(site_season_p, method = "BH"),
season_scion_p = p.adjust(season_scion_p, method = "BH")
)
cat(
"Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values", "\n\n",
"Site:", nrow(otu_anova_results_adjusted[site_p < 0.05, ]), "\n",
"Scion:", nrow(otu_anova_results_adjusted[scion_p < 0.05, ]), "\n",
"Season:", nrow(otu_anova_results_adjusted[season_p < 0.05, ]), "\n",
"Site:Scion:", nrow(otu_anova_results_adjusted[site_scion_p < 0.05, ]), "\n",
"Site:Season:", nrow(otu_anova_results_adjusted[site_season_p < 0.05, ]), "\n",
"Season:Scion:", nrow(otu_anova_results_adjusted[season_scion_p < 0.05, ]), "\n\n"
)
# Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values
#
# Site: 136
# Scion: 0
# Season: 1
# Site:Scion: 0
# Site:Season: 36
# Season:Scion: 0
otu_anova_results_adjusted[scion_p < 0.05, ]
# Empty data.table (0 rows and 15 cols): OTU,Taxonomy,site_var,site_p,scion_var,scion_p...
otu_anova_results_adjusted[site_scion_p < 0.05, ]
# Empty data.table (0 rows and 15 cols): OTU,Taxonomy,site_var,site_p,scion_var,scion_p...
otu_anova_results_adjusted[season_scion_p < 0.05, ]
# Empty data.table (0 rows and 15 cols): OTU,Taxonomy,site_var,site_p,scion_var,scion_p...
Testing the effects of of total abundance, ASV abundance, α-diversity, and β-diversity on canker counts.
This uses a nested negative binomial regression model.
The base model for canker counts uses the formula: Cankers ~ Site * Season * Scion.
# Filter out samples with missing canker count
canker_abundance_data <- colData[complete.cases(colData$Cankers), ]
# Base model
canker_design = "Cankers ~ Site * Season * Scion"
base_model <- glm.nb(canker_design, data = canker_abundance_data)
# Abundance model
abundance_design = paste(canker_design, "+ log(copy_number)")
abundance_model <- glm.nb(abundance_design, data = canker_abundance_data)
# ANOVA of abundance with canker count
kable(anova(base_model, abundance_model))
| Model | theta | Resid. df | 2 x log-lik. | Test | df | LR stat. | Pr(Chi) |
|---|---|---|---|---|---|---|---|
| Site * Season * Scion | 2.828262 | 32 | -546.8670 | NA | NA | NA | |
| Site * Season * Scion + log(copy_number) | 2.831854 | 31 | -546.8376 | 1 vs 2 | 1 | 0.0293702 | 0.863927 |
# Filter out samples with missing canker count
canker_top_asv_data <- top_asv_data[complete.cases(top_asv_data$Cankers), ]
# all.equal(top_asv_data[c("Site", "Season", "Scion", "Cankers")],cankers)
# Base model design
canker_design = "Cankers ~ Site * Season * Scion"
# Base model with ASV abundance data
base_model <- glm.nb(canker_design, data = canker_top_asv_data)
# ANOVA of top ASVs with canker count
asv_canker_anova <- function(otu, design, base_model, data) {
log_otu = paste0("log(", otu, " + 1)")
f = paste(design, "+", log_otu)#, "+", log_otu, ":Site")
m = glm.nb(f, data = data)
a = anova(base_model, m) %>% data.frame()
b = suppressWarnings(anova(m)) %>% data.frame()
total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
d = data.frame(
OTU = otu,
Taxonomy = taxData[otu, "rank"],
coef = m$coefficients[log_otu],
var = b[log_otu, 'Deviance'] / total_deviance * 100,
p = a[2, 'Pr.Chi.']
)
return(d)
}
kable(asv_canker_anova(top_asv_ids[1], canker_design, base_model, canker_top_asv_data))
| OTU | Taxonomy | coef | var | p | |
|---|---|---|---|---|---|
| log(OTU1 + 1) | OTU1 | Dactylonectria macrodidyma(s) | 0.0184892 | 3.707987 | 0.9016206 |
# Effect of ASV abundance on canker count for top ASVs
asv_canker_results <- sapply(
top_asv_ids,
function(x) asv_canker_anova(x, canker_design, base_model, canker_top_asv_data)
) %>% t() %>% data.table()
# Adjust p-values for multiple testing
asv_canker_results$p_adjusted <- p.adjust(asv_canker_results$p, method = "BH")
# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
nrow(asv_canker_results[p_adjusted < 0.05, ]), "of", nrow(asv_canker_results),
"ASVs have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 0 of 147 ASVs have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(asv_canker_results[p_adjusted < 0.05, ]) > 0) {
asv_canker_results[p_adjusted < 0.05, ]
}
site <- 3
canker_site_design <- "Cankers ~ Season * Scion"
# Base model with ASV abundance data
base_model_1 <- glm.nb(canker_site_design, data = canker_top_asv_data[canker_top_asv_data$Site == site,])
# Effect of ASV abundance on canker count for top ASVs
asv_canker_results_1 <- sapply(
top_asv_ids, function(x) asv_canker_anova(
x, canker_site_design, base_model_1,
canker_top_asv_data[canker_top_asv_data$Site == site, ]
)
) %>% t() %>% data.table()
# Adjust p-values for multiple testing
asv_canker_results_1$p_adjusted <- p.adjust(asv_canker_results$p, method = "BH")
# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
nrow(asv_canker_results_1[p_adjusted < 0.05, ]), "of", nrow(asv_canker_results_1),
"ASVs have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 0 of 147 ASVs have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(asv_canker_results_1[p_adjusted < 0.05, ]) > 0) {
asv_canker_results_1[p_adjusted < 0.05, ]
}
# Add genus from taxData to countData
fun_genus_data <- counts(dds, normalize = T) %>% as.data.frame() %>% mutate(
genus = taxData[rownames(countData), "genus"]
)
# Group by genus
fun_genus_data <- fun_genus_data %>% group_by(genus) %>% summarise_all(sum) %>% as.data.frame()
# Set rownames as genus
rownames(fun_genus_data) <- fun_genus_data$genus
fun_genus_data <- dplyr::select(fun_genus_data, -genus)
# Filter genera with mean abundance < 100
fun_genus_data <- fun_genus_data[rowMeans(fun_genus_data) > 10, ]
# Rank not genus
not_genus <- rownames(fun_genus_data)[grep("\\([a-z]\\)", rownames(fun_genus_data))]
# Remove rows with genus in not_genus
fun_genus_data <- fun_genus_data[!rownames(fun_genus_data) %in% not_genus, ]
cat(
length(not_genus), "non-genus ranks removed:\n\n",
not_genus, "\n"
)
# 1 non-genus ranks removed:
#
# Clavicipitaceae(f)
# Final genus list
fun_genera <- rownames(fun_genus_data)
# Transpose and add metadata from colData
fun_genus_data <- t(fun_genus_data) %>% as.data.frame() %>% mutate(
Site = colData[rownames(.), "Site"],
Season = colData[rownames(.), "Season"],
Scion = colData[rownames(.), "Scion"],
Cankers = colData[rownames(.), "Cankers"]
)
# Filter out samples with missing canker count
fun_genus_data <- fun_genus_data[complete.cases(fun_genus_data$Cankers), ]
# Base model design
canker_design = "Cankers ~ Site * Season * Scion"
# Base model with genus abundance data
base_model <- glm.nb(canker_design, data = fun_genus_data)
# ANOVA of genus abundance with canker count
genus_canker_anova <- function(genus, design, base_model, data) {
log_genus = paste0("log(", genus, " + 1)")
f = paste(design, "+", log_genus)#, "+", log_genus, ":Site")
m = glm.nb(f, data = data)
a = anova(base_model, m) %>% data.frame()
b = suppressWarnings(anova(m)) %>% data.frame()
total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
d = data.table(
Genus = genus,
Abundance = mean(data[[genus]]),
coef = m$coefficients[log_genus],
var = b[log_genus, 'Deviance'] / total_deviance * 100,
p = a[2, 'Pr.Chi.']
)
return(d)
}
# genus_canker_anova(fun_genera[1], canker_design, base_model, fun_genus_data)
# Effect of genera abundance on canker counts
genus_canker_results <- sapply(
fun_genera,
function(x) genus_canker_anova(x, canker_design, base_model, fun_genus_data)
) %>% t() %>% data.table()
# Adjust p-values for multiple testing
genus_canker_results$p_adjusted <- p.adjust(genus_canker_results$p, method = "BH")
# Summary of genera with statistically significant (*P* < 0.05) adjusted p-values
cat(
nrow(genus_canker_results[p_adjusted < 0.05, ]), "of", nrow(genus_canker_results),
"genera have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 3 of 197 genera have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(genus_canker_results[p_adjusted < 0.05, ]) > 0) {
genus_canker_results[p_adjusted < 0.05, ]
}
# Genus Abundance coef var p p_adjusted
# 1: Corynascus 29.95441 0.4553139 0.06740052 0.0004488953 0.0337803378
# 2: Juxtiphoma 10.55033 0.5576448 3.793486 3.410562e-06 0.0006718808
# 3: Lepiota 26.4591 0.3081053 0.2728229 0.0005144214 0.0337803378
# ANOVA of α-diversity with canker count
# Base model with α-diversity data
base_model <- glm.nb(canker_design, data = all_alpha_ord)
measures <- c("S.chao1", "shannon", "simpson")
# ANOVA of α-diversity with canker count
alpha_canker_anova <- function(measure, data) {
f = paste(canker_design, "+", measure)
m = glm.nb(f, data = data)
a = anova(base_model, m) %>% data.frame()
b = anova(m) %>% data.frame()
total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
d = data.frame(
measure = measure,
coef = m$coefficients[measure],
var = b[measure, 'Deviance'] / total_deviance * 100,
p = a[2, 'Pr.Chi.']
)
return(d)
}
# alpha_canker_anova("shannon", all_alpha_ord)
# Effect of α-diversity on canker count for each measure
alpha_canker_results <- data.table(t(sapply(measures, function(x) alpha_canker_anova(x, all_alpha_ord))))
alpha_canker_results
# measure coef var p
# 1: S.chao1 0.003337705 0.03842385 0.08796031
# 2: shannon -0.08300265 0.2109241 0.7883279
# 3: simpson -4.031448 1.819761 0.04187015
no_pcs <- 4
# Merge PC scores with canker data
pc_scores <- merge(colData, data.frame(mypca$x[, 1:no_pcs]), by = "row.names") %>%
column_to_rownames("Row.names")
pcs <- tail(colnames(pc_scores), no_pcs)
# Base model with β-diversity data
base_model <- glm.nb(canker_design, data = pc_scores)
# ANOVA of β-diversity with canker count
beta_canker_anova <- function(pc, data) {
f = paste0(canker_design, "+", pc)
m = glm.nb(f, data = data)
a = anova(base_model, m) %>% data.frame()
b = anova(m) %>% data.frame()
total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
d = data.frame(
PC = pc,
coef = m$coefficients[pc],
var = b[pc, 'Deviance'] / total_deviance * 100,
p = a[2, 'Pr.Chi.']
)
return(d)
}
# Effect of β-diversity on canker count for each PC
beta_canker_results <- data.table(t(sapply(pcs, function(x) beta_canker_anova(x, pc_scores))))
beta_canker_results
# PC coef var p
# 1: PC1 0.01281376 4.654705 0.3209296
# 2: PC2 -0.002516435 0.05130682 0.7117345
# 3: PC3 0.0119703 0.1146305 0.108615
# 4: PC4 0.02633142 1.147527 0.06763427
# Save environment
save.image("FUN.RData")
# Unpack bacteria data
invisible(mapply(assign, names(ubiome_BAC), ubiome_BAC, MoreArgs = list(envir = globalenv())))
cat(
"Raw reads", "\n\n",
"Total raw reads:\t\t", sum(countData), "\n",
"Mean raw reads per sample:\t", mean(colSums(countData)), "\n",
"Median raw reads per sample:\t", median(colSums(countData)), "\n",
"Max raw reads per sample:\t", max(colSums(countData)), "\n",
"Min raw reads per sample:\t", min(colSums(countData)), "\n\n"
)
# Raw reads
#
# Total raw reads: 3365816
# Mean raw reads per sample: 41046.54
# Median raw reads per sample: 40406.5
# Max raw reads per sample: 89023
# Min raw reads per sample: 15049
#colSums(countData)
nct <- counts(dds, normalize = T)
cat("Normalised reads", "\n\n",
"Total normalised reads:\t\t", sum(nct), "\n",
"Mean normalised reads per sample:\t", mean(colSums(nct)), "\n",
"Median normalised reads per sample:\t", median(colSums(nct)), "\n",
"Min normalised reads per sample:\t", min(colSums(nct)), "\n",
"Max normalised reads per sample:\t", max(colSums(nct)), "\n\n"
)
# Normalised reads
#
# Total normalised reads: 4585124
# Mean normalised reads per sample: 55916.14
# Median normalised reads per sample: 53407.32
# Min normalised reads per sample: 9940.045
# Max normalised reads per sample: 139399.1
#round(colSums(counts(dds,normalize = T)),0)
cat(
"Total OTUs:\t\t", nrow(taxData),"\n\n",
"Raw reads per OTU summary", "\n\n",
"Mean raw reads per OTU:\t", mean(rowSums(countData)),"\n",
"Median raw per OTU:\t\t", median(rowSums(countData)),"\n",
"OTU raw Min reads:\t\t", min(rowSums(countData)),"\n",
"OTU raw Max reads:\t\t", max(rowSums(countData)),"\n\n"
)
# Total OTUs: 5883
#
# Raw reads per OTU summary
#
# Mean raw reads per OTU: 572.1258
# Median raw per OTU: 120
# OTU raw Min reads: 34
# OTU raw Max reads: 106398
cat(
"Normalised reads per OTU summary","\n\n",
"Mean normalised reads per OTU:\t\t", mean(rowSums(nct)),"\n",
"Median normalised reads per OTU:\t", median(rowSums(nct)),"\n",
"OTU normalised Min reads:\t\t", min(rowSums(nct)),"\n",
"OTU normalised Max reads:\t\t", max(rowSums(nct)),"\n\n"
)
# Normalised reads per OTU summary
#
# Mean normalised reads per OTU: 779.3853
# Median normalised reads per OTU: 156.6744
# OTU normalised Min reads: 20.59746
# OTU normalised Max reads: 139400.3
y <- rowSums(nct)
y <- y[order(y, decreasing = T)]
# proportion
xy <- y/sum(y)
cat("Top ", TOPOTU, "OTUs:\n")
# Top 10 OTUs:
data.frame(counts = y[1:TOPOTU], proportion = xy[1:TOPOTU], rank = taxData[names(y)[1:TOPOTU],]$rank)
# counts proportion rank
# OTU1 139400.30 0.030402734 Streptomyces(g)
# OTU2 124366.87 0.027123994 Kineosporiales(o)
# OTU3 113814.51 0.024822559 Kineosporiaceae(f)
# OTU4 99723.88 0.021749441 Streptomyces(g)
# OTU5 96979.25 0.021150846 Streptomyces(g)
# OTU6 79618.94 0.017364621 Streptomyces(g)
# OTU7 51308.93 0.011190305 Bradyrhizobium(g)
# OTU8 49535.82 0.010803594 Actinoplanes(g)
# OTU20 33817.55 0.007375494 Actinobacteria(c)
# OTU9 33580.42 0.007323775 Nonomuraea(g)
Proportion of OTUs which can be assigned (with the given confidence) at each taxonomic rank
# Proportion of OTUs which can be assigned (with the given confidence) at each taxonomic rank
tx <- copy(taxData)
setDT(tx)
cols <- names(tx)[9:15]
tx[, (cols) := lapply(.SD, as.factor), .SDcols = cols]
data.table(
rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
"0.8" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.8) / nrow(tx))), 2),
"0.65" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.65) / nrow(tx))), 2),
"0.5" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.5) / nrow(tx))), 2)
)
# rank 0.8 0.65 0.5
# 1: kingdom 1.00 1.00 1.00
# 2: phylum 0.94 0.97 0.99
# 3: class 0.84 0.90 0.93
# 4: order 0.65 0.72 0.79
# 5: family 0.51 0.57 0.64
# 6: genus 0.35 0.43 0.51
# 7: species 0.00 0.00 0.00
% of reads which can be assigned to each taxonomic ranks
tx <-taxData[rownames(dds),]
nc <- counts(dds, normalize = T)
ac <- sum(nc)
data.table(
rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
"0.8" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.8),]) / ac * 100))), 2),
"0.65" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.65),]) / ac * 100))), 2),
"0.5" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.5),]) / ac * 100))), 2)
)
# rank 0.8 0.65 0.5
# 1: kingdom 99.95 100.00 100.00
# 2: phylum 97.78 99.43 99.75
# 3: class 92.63 95.82 98.05
# 4: order 71.84 81.25 87.03
# 5: family 58.63 67.80 73.01
# 6: genus 44.35 51.20 58.33
# 7: species 0.00 0.00 0.00
abundance_plot <- ggplot(
data = as.data.frame(colData(dds)),
aes(x = Site, y = log_copy_number, colour = Scion, shape = Season)
) + geom_jitter() +
scale_colour_manual(values = cbPalette)
abundance_plot <- ggboxplot(
data = as.data.frame(colData(dds)), x = "Site", y = "log_copy_number",
color = "Scion", add = "jitter", legend = "top",
title = "Bacterial abundance", xlab = "Site", ylab = "log10 copy number"
)
ggsave(
filename = "bac_abundance.png", plot = abundance_plot, path = "figures/",
height = 20, width = 20, units = "cm"
)
abundance_plot
# Formula for ANOVA
formula <- update(FULL_DESIGN, log_copy_number ~ .)
abundance_anova <- aov(formula, data = as.data.frame(colData(dds)))
# Normality check
par(mfrow = c(2, 2))
plot(abundance_anova)
png("figures/fun_abundance_norm.png", width = 800, height = 600)
par(mfrow = c(2, 2))
plot(abundance_anova)
dev.off()
# png
# 2
# Results
summary(abundance_anova)
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 1.9657 0.9828 25.302 7.91e-08 ***
# Season 1 0.0798 0.0798 2.056 0.1594
# Scion 6 0.5131 0.0855 2.202 0.0628 .
# Site:Season 2 0.0768 0.0384 0.989 0.3809
# Site:Scion 12 0.2677 0.0223 0.574 0.8494
# Season:Scion 6 0.1640 0.0273 0.704 0.6484
# Site:Season:Scion 12 0.1889 0.0157 0.405 0.9530
# Residuals 40 1.5538 0.0388
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abundance_results <- abundance_anova %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(abundance_results$Sum.Sq)
abundance_results$Perc.Var <- abundance_results$Sum.Sq / total_variance * 100
abundance_results
# Df Sum.Sq Mean.Sq F.value Pr..F. Perc.Var
# Site 2 1.96566003 0.98283002 25.3015018 7.913123e-08 40.867065
# Season 1 0.07984549 0.07984549 2.0555037 1.594283e-01 1.660028
# Scion 6 0.51312816 0.08552136 2.2016206 6.280900e-02 10.668194
# Site:Season 2 0.07683263 0.03841631 0.9889711 3.808702e-01 1.597389
# Site:Scion 12 0.26772071 0.02231006 0.5743394 8.494197e-01 5.566049
# Season:Scion 6 0.16398521 0.02733087 0.7035927 6.484006e-01 3.409335
# Site:Season:Scion 12 0.18892649 0.01574387 0.4053027 9.530223e-01 3.927877
# Residuals 40 1.55378922 0.03884473 NA NA 32.304063
# plot alpha diversity - plot_alpha will convert normalised abundances to integer values
bac_alpha_plot <- plot_alpha(
counts(dds,normalize = F), colData(dds),
design = "Scion", colour = "Site",
measures = c("Shannon", "Simpson"),
type="box"
) +
scale_colour_manual(values = cbPalette) +
theme(axis.title.x = element_blank()) +
ggtitle("Bacterial α-diversity")
abundance_plot <- ggboxplot(
data = as.data.frame(colData(dds)), x = "Site", y = "log_copy_number",
color = "Scion", add = "jitter", legend = "top",
title = "Fungal abundance", xlab = "Site", ylab = "log10 copy number"
)
ggsave(
filename = "bac_alpha.png", plot = bac_alpha_plot, path = "figures/",
height = 20, width = 40, units = "cm"
)
bac_alpha_plot
# get the diversity index data
all_alpha_ord <- plot_alpha(
counts(dds, normalize = F), colData(dds), design = "Site", returnData = T
)
# join diversity indices and metadata
all_alpha_ord <- all_alpha_ord[
as.data.table(colData(dds), keep.rownames = "Samples"), on = "Samples"
]
bac_alpha <- all_alpha_ord
formula <- FULL_DESIGN
setkey(all_alpha_ord, S.chao1)
all_alpha_ord[, measure := as.numeric(as.factor(S.chao1))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings: sequential SS "
summary(result)
# Component 1 :
# Df R Sum Sq R Mean Sq Iter Pr(Prob)
# Site 2 20914.4 10457.2 5000 <2e-16 ***
# Season 1 113.5 113.5 110 0.4818
# Site:Season 2 3319.8 1659.9 5000 0.0094 **
# Scion 6 1291.7 215.3 407 0.5037
# Site:Scion 12 5425.8 452.1 2201 0.1577
# Season:Scion 6 867.4 144.6 699 0.8054
# Site:Season:Scion 12 1912.4 159.4 551 0.9456
# Residuals 40 12095.5 302.4
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
# Df R.Sum.Sq R.Mean.Sq Iter Pr.Prob. Perc.Var
# Site 2 20914.3677 10457.1839 5000 0.0000000 45.5249023
# Season 1 113.5359 113.5359 110 0.4818182 0.2471368
# Site:Season 2 3319.8162 1659.9081 5000 0.0094000 7.2263388
# Scion 6 1291.7022 215.2837 407 0.5036855 2.8116852
# Site:Scion 12 5425.7655 452.1471 2201 0.1576556 11.8104189
# Season:Scion 6 867.3693 144.5615 699 0.8054363 1.8880275
# Site:Season:Scion 12 1912.4432 159.3703 551 0.9455535 4.1628699
# Residuals 40 12095.5000 302.3875 NA NA 26.3286207
setkey(all_alpha_ord, shannon)
all_alpha_ord[, measure := as.numeric(as.factor(shannon))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings: sequential SS "
summary(result)
# Component 1 :
# Df R Sum Sq R Mean Sq Iter Pr(Prob)
# Site 2 23583.2 11791.6 5000 <2e-16 ***
# Season 1 89.3 89.3 120 0.4583
# Site:Season 2 898.4 449.2 714 0.2983
# Scion 6 994.0 165.7 165 0.7939
# Site:Scion 12 3435.7 286.3 5000 0.7100
# Season:Scion 6 481.5 80.3 84 1.0000
# Site:Season:Scion 12 2143.3 178.6 2635 0.9450
# Residuals 40 14315.0 357.9
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
# Df R.Sum.Sq R.Mean.Sq Iter Pr.Prob. Perc.Var
# Site 2 23583.23942 11791.61971 5000 0.0000000 51.3343116
# Season 1 89.26865 89.26865 120 0.4583333 0.1943136
# Site:Season 2 898.39852 449.19926 714 0.2983193 1.9555698
# Scion 6 994.03480 165.67247 165 0.7939394 2.1637440
# Site:Scion 12 3435.70147 286.30846 5000 0.7100000 7.4785896
# Season:Scion 6 481.54603 80.25767 84 1.0000000 1.0481950
# Site:Season:Scion 12 2143.31111 178.60926 2635 0.9449715 4.6654066
# Residuals 40 14315.00000 357.87500 NA NA 31.1598698
setkey(all_alpha_ord, simpson)
all_alpha_ord[, measure := as.numeric(as.factor(simpson))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings: sequential SS "
summary(result)
# Component 1 :
# Df R Sum Sq R Mean Sq Iter Pr(Prob)
# Site 2 20851.3 10425.7 5000 <2e-16 ***
# Season 1 31.0 31.0 163 0.3804
# Site:Season 2 1422.3 711.2 1690 0.1118
# Scion 6 1995.8 332.6 609 0.3924
# Site:Scion 12 3933.8 327.8 1536 0.4915
# Season:Scion 6 1003.5 167.3 677 0.5968
# Site:Season:Scion 12 3012.3 251.0 3909 0.7135
# Residuals 40 13690.5 342.3
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
# Df R.Sum.Sq R.Mean.Sq Iter Pr.Prob. Perc.Var
# Site 2 20851.30423 10425.65212 5000 0.0000000 45.38763016
# Season 1 30.96912 30.96912 163 0.3803681 0.06741138
# Site:Season 2 1422.32005 711.16002 1690 0.1118343 3.09600472
# Scion 6 1995.82345 332.63724 609 0.3924466 4.34436598
# Site:Scion 12 3933.80635 327.81720 1536 0.4915365 8.56282877
# Season:Scion 6 1003.51689 167.25281 677 0.5967504 2.18438391
# Site:Season:Scion 12 3012.25990 251.02166 3909 0.7134817 6.55687225
# Residuals 40 13690.50000 342.26250 NA NA 29.80050282
# Number of PCs to include
n_pcs <- 20
# perform PC decomposition of DES object
mypca <- des_to_pca(dds)
# to get pca plot axis into the same scale create a dataframe of PC scores multiplied by their variance
bac_pca <- t(data.frame(t(mypca$x) * mypca$percentVar))
formula = FULL_DESIGN
# Cumulative percentage of variance explained
pca_cum_var <- data.frame(
cumulative = cumsum(mypca$percentVar * 100),
no = 1:length(mypca$percentVar)
)
# Plot cumulative percentage of variance explained
bac_cum_pca <- ggline(
pca_cum_var, x = "no", y = "cumulative", plot_type = "l",
xlab = "Number of PCs", ylab = "Cumulative % variance explained",
title = "Bacteria: cumulative % variance explained by PCs"
)
ggsave(filename = "bac_cum_pca.png", plot = bac_cum_pca, path = "figures/",)
bac_cum_pca
pca_var <- data.frame(
row.names = paste0("PC", 1:n_pcs),
perc_var = round(mypca$percentVar[1:n_pcs] * 100, 1)
)
pca_var
# perc_var
# PC1 18.6
# PC2 12.7
# PC3 7.1
# PC4 4.0
# PC5 3.0
# PC6 2.1
# PC7 1.8
# PC8 1.6
# PC9 1.3
# PC10 1.3
# PC11 1.3
# PC12 1.2
# PC13 1.2
# PC14 1.1
# PC15 1.1
# PC16 1.1
# PC17 1.0
# PC18 1.0
# PC19 1.0
# PC20 1.0
pca_summary <- apply(
mypca$x[, 1:n_pcs], 2,
function(x){
summary(aov(update(formula, x ~ .), data = as.data.frame(cbind(x, colData(dds)))))
}
)
pca_summary
# $PC1
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 190646 95323 224.072 <2e-16 ***
# Season 1 25 25 0.058 0.8106
# Scion 6 4227 704 1.656 0.1572
# Site:Season 2 2566 1283 3.015 0.0603 .
# Site:Scion 12 8401 700 1.646 0.1176
# Season:Scion 6 1527 255 0.598 0.7298
# Site:Season:Scion 12 5124 427 1.004 0.4634
# Residuals 40 17016 425
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC2
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 137668 68834 276.644 <2e-16 ***
# Season 1 142 142 0.572 0.454
# Scion 6 1474 246 0.987 0.447
# Site:Season 2 604 302 1.213 0.308
# Site:Scion 12 4425 369 1.482 0.172
# Season:Scion 6 711 119 0.476 0.822
# Site:Season:Scion 12 2035 170 0.681 0.759
# Residuals 40 9953 249
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC3
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 2673 1337 1.562 0.222149
# Season 1 30 30 0.035 0.851920
# Scion 6 13665 2278 2.663 0.028714 *
# Site:Season 2 14748 7374 8.621 0.000771 ***
# Site:Scion 12 10740 895 1.046 0.428331
# Season:Scion 6 8140 1357 1.586 0.176467
# Site:Season:Scion 12 2870 239 0.280 0.989466
# Residuals 40 34217 855
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC4
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 5281 2640.3 5.834 0.00598 **
# Season 1 635 634.6 1.402 0.24336
# Scion 6 3161 526.9 1.164 0.34456
# Site:Season 2 630 315.1 0.696 0.50442
# Site:Scion 12 13692 1141.0 2.521 0.01418 *
# Season:Scion 6 1827 304.5 0.673 0.67221
# Site:Season:Scion 12 6439 536.6 1.186 0.32589
# Residuals 40 18104 452.6
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC5
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 5 2 0.006 0.99447
# Season 1 4033 4033 9.433 0.00382 **
# Scion 6 1458 243 0.568 0.75299
# Site:Season 2 5967 2984 6.979 0.00251 **
# Site:Scion 12 2582 215 0.503 0.90010
# Season:Scion 6 1984 331 0.773 0.59554
# Site:Season:Scion 12 3480 290 0.678 0.76149
# Residuals 40 17102 428
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC6
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 32 16.2 0.055 0.9467
# Season 1 1498 1497.8 5.070 0.0299 *
# Scion 6 4754 792.3 2.682 0.0278 *
# Site:Season 2 811 405.3 1.372 0.2653
# Site:Scion 12 2040 170.0 0.575 0.8486
# Season:Scion 6 1724 287.4 0.973 0.4560
# Site:Season:Scion 12 3166 263.8 0.893 0.5611
# Residuals 40 11817 295.4
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC7
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 85 42.7 0.157 0.8556
# Season 1 448 448.2 1.645 0.2070
# Scion 6 2285 380.9 1.398 0.2393
# Site:Season 2 2000 999.9 3.670 0.0344 *
# Site:Scion 12 2462 205.1 0.753 0.6925
# Season:Scion 6 983 163.9 0.602 0.7273
# Site:Season:Scion 12 3081 256.7 0.942 0.5166
# Residuals 40 10898 272.4
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC8
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 110 55.1 0.197 0.822
# Season 1 549 548.5 1.963 0.169
# Scion 6 750 125.1 0.448 0.842
# Site:Season 2 334 167.1 0.598 0.555
# Site:Scion 12 1153 96.1 0.344 0.975
# Season:Scion 6 1856 309.4 1.108 0.375
# Site:Season:Scion 12 3233 269.4 0.964 0.497
# Residuals 40 11175 279.4
#
# $PC9
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 291 145.4 0.624 0.54101
# Season 1 120 120.0 0.515 0.47731
# Scion 6 658 109.6 0.470 0.82619
# Site:Season 2 3474 1737.2 7.452 0.00177 **
# Site:Scion 12 1215 101.2 0.434 0.93963
# Season:Scion 6 606 101.0 0.433 0.85225
# Site:Season:Scion 12 970 80.9 0.347 0.97417
# Residuals 40 9324 233.1
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC10
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 131 65.4 0.396 0.67572
# Season 1 365 365.5 2.213 0.14466
# Scion 6 1060 176.7 1.070 0.39635
# Site:Season 2 2202 1100.8 6.666 0.00317 **
# Site:Scion 12 2043 170.2 1.031 0.44086
# Season:Scion 6 1947 324.4 1.965 0.09381 .
# Site:Season:Scion 12 1980 165.0 0.999 0.46711
# Residuals 40 6605 165.1
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC11
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 15 7.62 0.030 0.970
# Season 1 84 84.49 0.332 0.568
# Scion 6 1487 247.76 0.974 0.455
# Site:Season 2 117 58.45 0.230 0.796
# Site:Scion 12 1745 145.45 0.572 0.851
# Season:Scion 6 838 139.64 0.549 0.768
# Site:Season:Scion 12 1543 128.57 0.506 0.899
# Residuals 40 10172 254.30
#
# $PC12
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 28 13.98 0.075 0.928
# Season 1 58 57.94 0.310 0.581
# Scion 6 1149 191.52 1.025 0.423
# Site:Season 2 474 237.00 1.268 0.292
# Site:Scion 12 1493 124.43 0.666 0.773
# Season:Scion 6 1331 221.83 1.187 0.333
# Site:Season:Scion 12 2905 242.10 1.296 0.259
# Residuals 40 7474 186.84
#
# $PC13
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 81 40.5 0.255 0.7763
# Season 1 309 308.7 1.945 0.1708
# Scion 6 635 105.8 0.666 0.6771
# Site:Season 2 794 397.2 2.502 0.0947 .
# Site:Scion 12 4123 343.6 2.164 0.0338 *
# Season:Scion 6 828 138.0 0.869 0.5258
# Site:Season:Scion 12 1113 92.8 0.584 0.8416
# Residuals 40 6350 158.8
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC14
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 90 44.9 0.296 0.7457
# Season 1 13 13.1 0.086 0.7707
# Scion 6 536 89.3 0.587 0.7382
# Site:Season 2 1555 777.6 5.113 0.0105 *
# Site:Scion 12 1200 100.0 0.657 0.7802
# Season:Scion 6 1495 249.2 1.638 0.1619
# Site:Season:Scion 12 2975 247.9 1.630 0.1219
# Residuals 40 6084 152.1
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC15
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 36 18.17 0.098 0.907
# Season 1 9 8.77 0.047 0.829
# Scion 6 1477 246.09 1.330 0.267
# Site:Season 2 469 234.74 1.269 0.292
# Site:Scion 12 1713 142.75 0.771 0.675
# Season:Scion 6 753 125.46 0.678 0.668
# Site:Season:Scion 12 1654 137.80 0.745 0.700
# Residuals 40 7402 185.04
#
# $PC16
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 28 14.1 0.079 0.9242
# Season 1 1096 1095.9 6.138 0.0175 *
# Scion 6 475 79.2 0.444 0.8451
# Site:Season 2 815 407.7 2.284 0.1150
# Site:Scion 12 534 44.5 0.249 0.9936
# Season:Scion 6 377 62.9 0.352 0.9045
# Site:Season:Scion 12 2909 242.4 1.358 0.2263
# Residuals 40 7141 178.5
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC17
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 1 0.29 0.002 0.998
# Season 1 38 37.87 0.231 0.634
# Scion 6 683 113.87 0.694 0.656
# Site:Season 2 315 157.64 0.961 0.391
# Site:Scion 12 1540 128.30 0.782 0.665
# Season:Scion 6 1359 226.52 1.380 0.246
# Site:Season:Scion 12 2442 203.46 1.240 0.291
# Residuals 40 6564 164.09
#
# $PC18
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 72 36.1 0.277 0.7599
# Season 1 168 168.2 1.290 0.2628
# Scion 6 851 141.8 1.087 0.3868
# Site:Season 2 271 135.6 1.040 0.3629
# Site:Scion 12 2714 226.2 1.734 0.0955 .
# Season:Scion 6 1954 325.6 2.496 0.0381 *
# Site:Season:Scion 12 1523 126.9 0.973 0.4897
# Residuals 40 5217 130.4
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# $PC19
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 61 30.31 0.191 0.827
# Season 1 16 16.12 0.101 0.752
# Scion 6 977 162.91 1.026 0.423
# Site:Season 2 170 84.95 0.535 0.590
# Site:Scion 12 2170 180.83 1.138 0.358
# Season:Scion 6 799 133.10 0.838 0.548
# Site:Season:Scion 12 1504 125.30 0.789 0.659
# Residuals 40 6354 158.85
#
# $PC20
# Df Sum Sq Mean Sq F value Pr(>F)
# Site 2 16 8.0 0.075 0.92807
# Season 1 150 150.0 1.397 0.24423
# Scion 6 2350 391.6 3.647 0.00557 **
# Site:Season 2 155 77.5 0.721 0.49224
# Site:Scion 12 1810 150.8 1.405 0.20414
# Season:Scion 6 976 162.6 1.515 0.19831
# Site:Season:Scion 12 2139 178.2 1.660 0.11382
# Residuals 40 4296 107.4
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pcas <- lapply(pca_summary, function(i) data.frame(unclass(i)))
pca_factors <- data.table(
PCs = names(pcas),
"Total_%" = round(pca_var$perc_var, 2),
"Site_%" = round(sapply(pcas, function(x) (x['Site ', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Site_*p*" = round(sapply(pcas, function(x) x['Site ', 'Pr..F.']), 4),
"Scion_%" = round(sapply(pcas, function(x) (x['Scion', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Scion_*p*" = round(sapply(pcas, function(x) x['Scion', 'Pr..F.']), 4),
"Season_%" = round(sapply(pcas, function(x) (x['Season ', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Season_*p*" = round(sapply(pcas, function(x) x['Season ', 'Pr..F.']), 4),
"Site:Scion_%" = round(sapply(pcas, function(x) (x['Site:Scion', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Site:Scion_*p*" = round(sapply(pcas, function(x) x['Site:Scion', 'Pr..F.']), 4),
"Site:Season_%" = round(sapply(pcas, function(x) (x['Site:Season ', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Site:Season_*p*" = round(sapply(pcas, function(x) x['Site:Season ', 'Pr..F.']), 4),
"Season:Scion_%" = round(sapply(pcas, function(x) (x['Season:Scion', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2),
"Season:Scion_*p*" = round(sapply(pcas, function(x) x['Season:Scion', 'Pr..F.']), 4),
"Residuals_%" = round(sapply(pcas, function(x) (x['Residuals', 'Sum.Sq'] / sum(x['Sum.Sq']) * 100)), 2)
)
kable(pca_factors)
| PCs | Total_% | Site_% | Site_p | Scion_% | Scion_p | Season_% | Season_p | Site:Scion_% | Site:Scion_p | Site:Season_% | Site:Season_p | Season:Scion_% | Season:Scion_p | Residuals_% |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| PC1 | 18.6 | 83.06 | 0.0000 | 1.84 | 0.1572 | 0.01 | 0.8106 | 3.66 | 0.1176 | 1.12 | 0.0603 | 0.67 | 0.7298 | 7.41 |
| PC2 | 12.7 | 87.68 | 0.0000 | 0.94 | 0.4468 | 0.09 | 0.4541 | 2.82 | 0.1716 | 0.38 | 0.3080 | 0.45 | 0.8218 | 6.34 |
| PC3 | 7.1 | 3.07 | 0.2221 | 15.69 | 0.0287 | 0.03 | 0.8519 | 12.33 | 0.4283 | 16.94 | 0.0008 | 9.35 | 0.1765 | 39.29 |
| PC4 | 4.0 | 10.61 | 0.0060 | 6.35 | 0.3446 | 1.28 | 0.2434 | 27.51 | 0.0142 | 1.27 | 0.5044 | 3.67 | 0.6722 | 36.38 |
| PC5 | 3.0 | 0.01 | 0.9945 | 3.98 | 0.7530 | 11.02 | 0.0038 | 7.05 | 0.9001 | 16.30 | 0.0025 | 5.42 | 0.5955 | 46.71 |
| PC6 | 2.1 | 0.13 | 0.9467 | 18.40 | 0.0278 | 5.80 | 0.0299 | 7.89 | 0.8486 | 3.14 | 0.2653 | 6.67 | 0.4560 | 45.73 |
| PC7 | 1.8 | 0.38 | 0.8556 | 10.27 | 0.2393 | 2.02 | 0.2070 | 11.07 | 0.6925 | 8.99 | 0.0344 | 4.42 | 0.7273 | 49.00 |
| PC8 | 1.6 | 0.57 | 0.8219 | 3.92 | 0.8423 | 2.86 | 0.1689 | 6.02 | 0.9751 | 1.74 | 0.5548 | 9.69 | 0.3751 | 58.32 |
| PC9 | 1.3 | 1.75 | 0.5410 | 3.95 | 0.8262 | 0.72 | 0.4773 | 7.29 | 0.9396 | 20.86 | 0.0018 | 3.64 | 0.8523 | 55.97 |
| PC10 | 1.3 | 0.80 | 0.6757 | 6.49 | 0.3964 | 2.24 | 0.1447 | 12.51 | 0.4409 | 13.48 | 0.0032 | 11.92 | 0.0938 | 40.44 |
| PC11 | 1.3 | 0.10 | 0.9705 | 9.29 | 0.4551 | 0.53 | 0.5676 | 10.91 | 0.8513 | 0.73 | 0.7957 | 5.24 | 0.7676 | 63.57 |
| PC12 | 1.2 | 0.19 | 0.9281 | 7.71 | 0.4233 | 0.39 | 0.5807 | 10.01 | 0.7726 | 3.18 | 0.2923 | 8.93 | 0.3327 | 50.12 |
| PC13 | 1.2 | 0.57 | 0.7763 | 4.46 | 0.6771 | 2.17 | 0.1708 | 28.97 | 0.0338 | 5.58 | 0.0947 | 5.82 | 0.5258 | 44.62 |
| PC14 | 1.1 | 0.64 | 0.7457 | 3.84 | 0.7382 | 0.09 | 0.7707 | 8.60 | 0.7802 | 11.15 | 0.0105 | 10.72 | 0.1619 | 43.62 |
| PC15 | 1.1 | 0.27 | 0.9067 | 10.93 | 0.2666 | 0.06 | 0.8287 | 12.68 | 0.6751 | 3.47 | 0.2923 | 5.57 | 0.6681 | 54.78 |
| PC16 | 1.1 | 0.21 | 0.9242 | 3.55 | 0.8451 | 8.19 | 0.0175 | 3.99 | 0.9936 | 6.10 | 0.1150 | 2.82 | 0.9045 | 53.39 |
| PC17 | 1.0 | 0.00 | 0.9982 | 5.28 | 0.6558 | 0.29 | 0.6336 | 11.90 | 0.6653 | 2.44 | 0.3913 | 10.50 | 0.2461 | 50.72 |
| PC18 | 1.0 | 0.56 | 0.7599 | 6.66 | 0.3868 | 1.32 | 0.2628 | 21.25 | 0.0955 | 2.12 | 0.3629 | 15.30 | 0.0381 | 40.86 |
| PC19 | 1.0 | 0.50 | 0.8270 | 8.11 | 0.4230 | 0.13 | 0.7517 | 18.01 | 0.3585 | 1.41 | 0.5899 | 6.63 | 0.5482 | 52.73 |
| PC20 | 1.0 | 0.14 | 0.9281 | 19.76 | 0.0056 | 1.26 | 0.2442 | 15.22 | 0.2041 | 1.30 | 0.4922 | 8.21 | 0.1983 | 36.13 |
bac_pca_plot <- plotOrd(
bac_pca,
colData(dds),
design = "Site",
shape = "Season",
axes = c(1, 2),
# facet = "Season",
cbPalette = T,
alpha = 0.75,
) #+ facet_wrap(~facet)
ggsave(filename = "bac_pca_plot.png", plot = bac_pca_plot, path = "figures/")
bac_pca_plot
bac_pca_2_3_plot <- plotOrd(
bac_pca,
colData(dds),
design = "Scion",
shape = "Site",
axes = c(2, 3),
cbPalette = T,
alpha = 0.75,
)
ggsave(filename = "bac_pca_2_3_plot.png", plot = bac_pca_2_3_plot, path = "figures/")
sum_squares <- apply(mypca$x, 2 ,function(x)
summary(aov(update(formula, x ~ .), data = cbind(x, colData(dds))))[[1]][2]
)
sum_squares <- do.call(cbind, sum_squares)
x <- t(apply(sum_squares, 2, prop.table))
perVar <- x * mypca$percentVar
#colSums(perVar)
round(colSums(perVar) / sum(colSums(perVar)) * 100, 3)
# Site Season Scion Site:Season Site:Scion Season:Scion Site:Season:Scion Residuals
# 27.377 1.163 6.201 3.608 11.077 5.262 9.804 35.509
# Calculate Bray-Curtis distance matrix
vg <- vegdist(t(counts(dds, normalize = T)), method = "bray")
formula <- update(FULL_DESIGN, vg ~ .)
set.seed(sum(utf8ToInt("Hamish McLean")))
result <- adonis2(formula, colData(dds), permutations = 1000)
result
# Permutation test for adonis under reduced model
# Terms added sequentially (first to last)
# Permutation: free
# Number of permutations: 1000
#
# adonis2(formula = formula, data = colData(dds), permutations = 1000)
# Df SumOfSqs R2 F Pr(>F)
# Site 2 4.6757 0.31606 19.4091 0.000999 ***
# Season 1 0.2240 0.01514 1.8596 0.056943 .
# Scion 6 0.9812 0.06633 1.3577 0.062937 .
# Site:Season 2 0.7189 0.04859 2.9841 0.002997 **
# Site:Scion 12 1.4891 0.10066 1.0302 0.406593
# Season:Scion 6 0.6866 0.04641 0.9500 0.556444
# Site:Season:Scion 12 1.2000 0.08112 0.8302 0.891109
# Residual 40 4.8180 0.32569
# Total 81 14.7934 1.00000
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% data.frame()
df$Perc.Var <- df$SumOfSqs / df["Total", "SumOfSqs"] * 100
df
# Df SumOfSqs R2 F Pr..F. Perc.Var
# Site 2 4.6756525 0.31606401 19.4090620 0.000999001 31.606401
# Season 1 0.2239849 0.01514090 1.8595639 0.056943057 1.514090
# Scion 6 0.9812079 0.06632754 1.3576946 0.062937063 6.632754
# Site:Season 2 0.7188806 0.04859478 2.9841392 0.002997003 4.859478
# Site:Scion 12 1.4890925 0.10065944 1.0302266 0.406593407 10.065944
# Season:Scion 6 0.6865717 0.04641076 0.9500073 0.556443556 4.641076
# Site:Season:Scion 12 1.1999721 0.08111552 0.8301990 0.891108891 8.111552
# Residual 40 4.8180097 0.32568705 NA NA 32.568705
# Total 81 14.7933719 1.00000000 NA NA 100.000000
set.seed(sum(utf8ToInt("Hamish McLean")))
ord <- metaMDS(vg,trace=0)
#sratmax=20000,maxit=20000,try = 177, trymax = 177
bac_nmds <- scores(ord)
bac_nmds_plot <- plotOrd(
bac_nmds, colData(dds),
design = "Site",
shape = "Season",
alpha = 0.75, cbPalette = T
) #+ theme(text = element_text(size = 14))
ggsave(filename = "fun_nmds_plot.png", plot = bac_nmds_plot, path = "figures/")
bac_nmds_plot
# Extract normalised counts from DESeq object
asv_counts <- counts(dds, normalize = T) %>% as.data.frame()
# Sum ASV counts across samples
total_asv_counts <- rowSums(asv_counts)
# Sort ASVs by abundance
total_asv_counts <- total_asv_counts[order(total_asv_counts, decreasing = T)]
# Caculate cumulative percentage
cumulative <- data.frame(
cumulative = cumsum(total_asv_counts) / sum(total_asv_counts) * 100,
no = seq_along(total_asv_counts)
)
# Plot cumulative percentage of ASVs
bac_cum_asv <- ggline(
data = cumulative, x = "no", y = "cumulative",
plot_type = "l", palette = cbPalette,
title = "Cumulative percentage of bacterial ASVs", xlab = "Number of ASVs",
ylab = "Cumulative percentage of reads"
)
ggsave(filename = "bac_cum_asv.png", plot = bac_cum_asv, path = "figures/")
bac_cum_asv
# Find the number of ASVs that account for 50%, 80%, and 99% of total reads
cat(
"Number of ASVs that account for 50%, 80%, and 99% of total reads", "\n\n",
"50%:", sum(cumulative <= 50), "\n",
"80%:", sum(cumulative <= 80), "\n",
"99%:", sum(cumulative <= 99), "\n"
)
# Number of ASVs that account for 50%, 80%, and 99% of total reads
#
# 50%: 205
# 80%: 1055
# 99%: 5037
# Find the cumulative percentage accounted for by top x ASVs
cat(
"Percentage of total reads accounted for by the top 100, 200,and 500 ASVs:", "\n\n",
"100:", round(cumulative[cumulative$no == 100, "cumulative"], 1) , "\n",
"200:", round(cumulative[cumulative$no == 200, "cumulative"], 1) , "\n",
"500:", round(cumulative[cumulative$no == 500, "cumulative"], 1) , "\n"
)
# Percentage of total reads accounted for by the top 100, 200,and 500 ASVs:
#
# 100: 43.7
# 200: 53.9
# 500: 69.1
# Average ASV counts in order
mean_asv_counts <- rowMeans(asv_counts)
mean_asv_counts <- mean_asv_counts[order(mean_asv_counts, decreasing = T)]
# Plot read count distribution
bac_asv_counts <- ggline(
data = data.frame(ASV = seq_along(mean_asv_counts), counts = mean_asv_counts),
x = "ASV", y = "counts", plot_type = "l",
title = "Bacterial ASV read count distribution", xlab = "ASV", ylab = "Mean read count"
)
ggsave(filename = "bac_asv_counts.png", plot = bac_asv_counts, path = "figures/")
bac_asv_counts
# Number of ASVs with mean read count > 100, 200, and 500
cat(
"Number of ASVs with mean read count > 100, 200, and 500", "\n\n",
"100:", sum(rowMeans(asv_counts) > 100), "\n",
"200:", sum(rowMeans(asv_counts) > 200), "\n",
"500:", sum(rowMeans(asv_counts) > 500), "\n"
)
# Number of ASVs with mean read count > 100, 200, and 500
#
# 100: 71
# 200: 32
# 500: 8
# Filter the top x abundant OTUs by the sum of their normalised counts
# top_asvs <- asv_counts[order(rowSums(asv_counts), decreasing = T)[1:DIFFOTU], ]
# Filter ASVs with mean read count > 100
top_asvs <- asv_counts[rowMeans(asv_counts) > 100, ]
# Check that sample names match
identical(names(top_asvs), rownames(colData))
# [1] TRUE
# Extract taxonomic data for top OTUs
top_taxa <- taxData[rownames(top_asvs), ]
# Log transform normalised counts
# top_asvs <- log10(top_asvs + 1) # Log transform in models instead
top_asv_data <- data.frame(t(top_asvs))
top_asv_ids <- rownames(top_asvs)
identical(rownames(top_asv_data), rownames(colData))
# [1] TRUE
top_asv_data <- merge(top_asv_data, colData, by = 0) %>% column_to_rownames("Row.names")
Effect of Site, Scion, and planting season on abundance of top 200 ASVs
# ANOVA of top ASVs
otu_lm_anova <- function(otu, formula, data) {
f = update(formula, paste0("log(", otu, " + 1) ~ ."))
a = aov(f, data = data) %>% summary() %>% unclass() %>% data.frame()
total_var = sum(a$Sum.Sq)
d = data.frame(
OTU = otu,
Taxonomy = taxData[otu, "rank"],
site_var = a['Site ', 'Sum.Sq'] / total_var * 100,
site_p = a['Site ', 'Pr..F.'],
scion_var = a['Scion', 'Sum.Sq'] / total_var * 100,
scion_p = a['Scion', 'Pr..F.'],
season_var = a['Season ', 'Sum.Sq'] / total_var * 100,
season_p = a['Season ', 'Pr..F.'],
site_scion_var = a['Site:Scion', 'Sum.Sq'] / total_var * 100,
site_scion_p = a['Site:Scion', 'Pr..F.'],
site_season_var = a['Site:Season ', 'Sum.Sq'] / total_var * 100,
site_season_p = a['Site:Season ', 'Pr..F.'],
season_scion_var = a['Season:Scion', 'Sum.Sq'] / total_var * 100,
season_scion_p = a['Season:Scion', 'Pr..F.'],
residuals_var = a['Residuals', 'Sum.Sq'] / total_var * 100
)
return(d)
}
# Negative binomial regression model
otu_negbin_anova <- function(otu, formula, data) {
f = update(formula, paste0(otu, " ~ ."))
m = glm.nb(f, data = data)
a = anova(m, test = "Chisq") %>% data.frame()
total_deviance = sum(a$Deviance, na.rm = T) + tail(a$Resid..Dev, 1)
d = data.frame(
OTU = otu,
Taxonomy = taxData[otu, "rank"],
site_var = a['Site', 'Deviance'] / total_deviance * 100,
site_p = a['Site', 'Pr..Chi.'],
scion_var = a['Scion', 'Deviance'] / total_deviance * 100,
scion_p = a['Scion', 'Pr..Chi.'],
season_var = a['Season', 'Deviance'] / total_deviance * 100,
season_p = a['Season', 'Pr..Chi.']
)
return(d)
}
formula <- FULL_DESIGN
otu_anova_results <- sapply(
top_asv_ids, function(x) otu_lm_anova(x, formula, top_asv_data)
) %>% t() %>% data.table()
otu_anova_results_adjusted <- otu_anova_results %>%
mutate(
site_p = p.adjust(site_p, method = "BH"),
scion_p = p.adjust(scion_p, method = "BH"),
season_p = p.adjust(season_p, method = "BH")
)
cat(
"Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values", "\n\n",
"Site:", nrow(otu_anova_results_adjusted[site_p < 0.05, ]), "\n",
"Scion:", nrow(otu_anova_results_adjusted[scion_p < 0.05, ]), "\n",
"Season:", nrow(otu_anova_results_adjusted[season_p < 0.05, ]), "\n",
"Site:Scion:", nrow(otu_anova_results_adjusted[site_scion_p < 0.05, ]), "\n",
"Site:Season:", nrow(otu_anova_results_adjusted[site_season_p < 0.05, ]), "\n",
"Season:Scion:", nrow(otu_anova_results_adjusted[season_scion_p < 0.05, ]), "\n\n"
)
# Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values
#
# Site: 68
# Scion: 6
# Season: 0
# Site:Scion: 3
# Site:Season: 50
# Season:Scion: 2
otu_anova_results_adjusted[scion_p < 0.05, ]
# OTU Taxonomy site_var site_p scion_var scion_p season_var season_p site_scion_var site_scion_p site_season_var site_season_p season_scion_var season_scion_p residuals_var
# 1: OTU11 Actinoplanes(g) 57.01493 1.077636e-13 8.779256 0.04194029 0.8337958 0.9505991 3.047229 0.7409656 10.65662 1.630064e-05 2.795821 0.2855695 14.49169
# 2: OTU17 Mycobacteriaceae(f) 63.94552 2.523714e-16 8.075128 0.02035916 0.150778 0.9505991 4.634767 0.1890384 5.824716 0.0001719382 2.541607 0.1787341 10.73637
# 3: OTU28 Amycolatopsis(g) 57.08183 4.611778e-13 11.85351 0.02035916 0.123614 0.9505991 3.306424 0.7658573 3.976902 0.0128961 4.084545 0.1551837 16.36471
# 4: OTU7 Bradyrhizobium(g) 13.22873 9.450589e-04 18.27876 0.04194029 0.5012272 0.9505991 11.13561 0.3192109 14.725 0.000424495 8.413546 0.1223116 31.04119
# 5: OTU70 Kribbella(g) 37.87329 4.859133e-10 11.17966 0.04194029 0.2103298 0.9505991 7.135868 0.2595534 17.38269 1.649034e-06 4.745871 0.1409803 18.37531
# 6: OTU83 Steroidobacter(g) 1.078752 5.409558e-01 25.67554 0.02035916 0.01326388 0.9539102 13.74426 0.2430695 9.945972 0.006369712 9.190067 0.1297052 34.57801
otu_anova_results_adjusted[site_scion_p < 0.05, ]
# OTU Taxonomy site_var site_p scion_var scion_p season_var season_p site_scion_var site_scion_p site_season_var site_season_p season_scion_var season_scion_p residuals_var
# 1: OTU27 Micromonosporaceae(f) 43.52383 1.608145e-11 5.694322 0.1315376 0.3820512 0.9505991 11.76138 0.01829337 10.92462 3.376895e-05 2.136838 0.5197268 16.22546
# 2: OTU30 Streptomyces(g) 51.85274 2.149921e-11 5.906322 0.1678927 2.254467 0.6911820 11.93521 0.04910443 4.422486 0.01768693 2.226376 0.6128633 19.78407
# 3: OTU5 Streptomyces(g) 43.70563 7.650936e-11 3.363344 0.4265673 0.1340044 0.9505991 13.1956 0.01973321 13.96242 1.270369e-05 2.664643 0.462205 18.44128
otu_anova_results_adjusted[season_scion_p < 0.05, ]
# OTU Taxonomy site_var site_p scion_var scion_p season_var season_p site_scion_var site_scion_p site_season_var site_season_p season_scion_var season_scion_p residuals_var
# 1: OTU38 Mycobacterium(g) 37.98721 1.099712e-08 11.27463 0.05950441 0.04875582 0.9505991 7.425487 0.4346678 2.49345 0.1367133 9.065908 0.03560492 23.83539
# 2: OTU8 Actinoplanes(g) 62.80041 4.193815e-17 4.119011 0.06706237 0.8239168 0.8162242 4.148908 0.1593185 11.20464 1.110335e-07 6.314704 0.001203779 9.131464
Testing the effects of of ASV abundance, α-diversity, and β-diversity on canker counts.
This uses a nested negative binomial regression model.
The base model for canker counts uses the formula: Cankers ~ Site * Season * Scion.
# Filter out samples with missing canker count
canker_abundance_data <- colData[complete.cases(colData$Cankers), ]
# Base model
canker_design = "Cankers ~ Site * Season * Scion"
base_model <- glm.nb(canker_design, data = canker_abundance_data)
# Abundance model
abundance_design = paste(canker_design, "+ log(copy_number)")
abundance_model <- glm.nb(abundance_design, data = canker_abundance_data)
# ANOVA of abundance with canker count
kable(anova(base_model, abundance_model))
| Model | theta | Resid. df | 2 x log-lik. | Test | df | LR stat. | Pr(Chi) |
|---|---|---|---|---|---|---|---|
| Site * Season * Scion | 2.882191 | 33 | -554.6118 | NA | NA | NA | |
| Site * Season * Scion + log(copy_number) | 2.980708 | 32 | -551.9342 | 1 vs 2 | 1 | 2.677645 | 0.1017661 |
# Filter out samples with missing canker count
top_asv_data <- top_asv_data[complete.cases(top_asv_data$Cankers), ]
# Base model design
canker_design = "Cankers ~ Site * Season * Scion"
# Base model with ASV abundance data
base_model <- glm.nb(canker_design, data = top_asv_data)
# ANOVA of top ASVs with canker count
asv_canker_anova <- function(otu, design, base_model, data) {
log_otu = paste0("log(", otu, " + 1)")
f = paste(design, "+", log_otu)#, "+", log_otu, ":Site")
m = glm.nb(f, data = data)
a = anova(base_model, m) %>% data.frame()
b = suppressWarnings(anova(m)) %>% data.frame()
total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
d = data.frame(
OTU = otu,
Taxonomy = taxData[otu, "rank"],
coef = m$coefficients[log_otu],
var = b[log_otu, 'Deviance'] / total_deviance * 100,
p = a[2, 'Pr.Chi.']
)
return(d)
}
# kable(asv_canker_anova(top_asv_ids[1], canker_design, base_model, canker_top_asv_data))
# Effect of ASV abundance on canker count for top ASVs
asv_canker_results <- sapply(
top_asv_ids,
function(x) asv_canker_anova(x, canker_design, base_model, top_asv_data)
) %>% t() %>% data.table()
# Adjust p-values for multiple testing
asv_canker_results$p_adjusted <- p.adjust(asv_canker_results$p, method = "BH")
# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
nrow(asv_canker_results[p_adjusted < 0.05, ]),
"ASVs have statistically significant (*P* < 0.05) adjusted p-values"
)
# 1 ASVs have statistically significant (*P* < 0.05) adjusted p-values
asv_canker_results[p_adjusted < 0.05, ]
# OTU Taxonomy coef var p p_adjusted
# 1: OTU40 Mycobacteriales(o) -0.5353163 1.070081 0.0004620826 0.03280786
# Add genus from taxData to countData
bac_genus_data <- counts(dds, normalize = T) %>% as.data.frame() %>% mutate(
genus = taxData[rownames(countData), "genus"]
)
# Group by genus and set genus to rownames
bac_genus_data <- bac_genus_data %>% group_by(genus) %>% summarise_all(sum) %>% as.data.frame()
# Set rownames as genus
rownames(bac_genus_data) <- bac_genus_data$genus
bac_genus_data <- dplyr::select(bac_genus_data, -genus)
# Filter genera with mean abundance < 100
bac_genus_data <- bac_genus_data[rowMeans(bac_genus_data) > 10, ]
# Rank not genus
not_genus <- rownames(bac_genus_data)[grep("\\([a-z]\\)", rownames(bac_genus_data))]
# Remove rows with genus in not_genus
bac_genus_data <- bac_genus_data[!rownames(bac_genus_data) %in% not_genus, ]
cat(
length(not_genus), "non-genus ranks removed:\n\n",
not_genus, "\n"
)
# 0 non-genus ranks removed:
#
#
# Remove any with slashes or underscores
removals <- rownames(bac_genus_data)[grep("[/_]", rownames(bac_genus_data))]
# Remove rows with symbols in the name
bac_genus_data <- bac_genus_data[!rownames(bac_genus_data) %in% removals, ]
cat(
length(removals), "ranks with slashes or underscores removed:\n\n",
removals, "\n"
)
# 4 ranks with slashes or underscores removed:
#
# Armatimonas/Armatimonadetes_gp1 Candidatus_Solibacter Saccharibacteria_genera_incertae_sedis Subdivision3_genera_incertae_sedis
# Set rownames as genus
bac_genera <- rownames(bac_genus_data)
# Transpose and add metadata from colData
bac_genus_data <- t(bac_genus_data) %>% as.data.frame() %>% mutate(
Site = colData[rownames(.), "Site"],
Season = colData[rownames(.), "Season"],
Scion = colData[rownames(.), "Scion"],
Cankers = colData[rownames(.), "Cankers"]
)
# Filter out samples with missing canker count
bac_genus_data <- bac_genus_data[complete.cases(bac_genus_data$Cankers), ]
# Base model design
canker_design = "Cankers ~ Site * Season * Scion"
# Base model with genus abundance data
base_model <- glm.nb(canker_design, data = bac_genus_data)
# ANOVA of genus abundance with canker count
genus_canker_anova <- function(genus, design, base_model, data) {
log_genus = paste0("log(", genus, " + 1)")
f = paste(design, "+", log_genus)#, "+", log_genus, ":Site")
m = glm.nb(f, data = data)
a = anova(base_model, m) %>% data.frame()
b = suppressWarnings(anova(m)) %>% data.frame()
total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
d = data.table(
Genus = genus,
Abundance = mean(data[[genus]]),
coef = m$coefficients[log_genus],
var = b[log_genus, 'Deviance'] / total_deviance * 100,
p = a[2, 'Pr.Chi.']
)
return(d)
}
# genus_canker_anova(bac_genera[1], canker_design, base_model, bac_genus_data)
# Effect of genera abundance on canker counts
genus_canker_results <- sapply(
bac_genera,
function(x) genus_canker_anova(x, canker_design, base_model, bac_genus_data)
) %>% t() %>% data.table()
# Adjust p-values for multiple testing
genus_canker_results$p_adjusted <- p.adjust(genus_canker_results$p, method = "BH")
# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
nrow(genus_canker_results[p_adjusted < 0.05, ]), "of", nrow(genus_canker_results),
"genera have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 2 of 238 genera have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(genus_canker_results[p_adjusted < 0.05, ]) > 0) {
genus_canker_results[p_adjusted < 0.05, ]
}
# Genus Abundance coef var p p_adjusted
# 1: Marisediminicola 11.01665 -0.7908458 1.442856 1.264145e-05 1.504332e-03
# 2: Pedomicrobium 22.65009 0.7736918 1.317756 3.761193e-07 8.951639e-05
# ANOVA of α-diversity with canker count
# Base model with α-diversity data
base_model <- glm.nb(canker_design, data = all_alpha_ord)
measures <- c("S.chao1", "shannon", "simpson")
# ANOVA of α-diversity with canker count
alpha_canker_anova <- function(measure, data) {
f = paste(canker_design, "+", measure)
m = glm.nb(f, data = data)
a = anova(base_model, m) %>% data.frame()
b = anova(m) %>% data.frame()
total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
d = data.frame(
measure = measure,
coef = m$coefficients[measure],
var = b[measure, 'Deviance'] / total_deviance * 100,
p = a[2, 'Pr.Chi.']
)
return(d)
}
# Effect of α-diversity on canker count for each measure
alpha_canker_results <- data.table(t(sapply(measures, function(x) alpha_canker_anova(x, all_alpha_ord))))
alpha_canker_results
# measure coef var p
# 1: S.chao1 0.0003260492 0.1949192 0.3631458
# 2: shannon 1.047428 3.043469 0.03415997
# 3: simpson 74.20115 2.554056 0.01184907
no_pcs <- 4
# Merge PC scores with canker data
pc_scores <- merge(colData, data.frame(mypca$x[, 1:no_pcs]), by = "row.names") %>%
column_to_rownames("Row.names")
pcs <- tail(colnames(pc_scores), no_pcs)
# Base model with β-diversity data
base_model <- glm.nb(canker_design, data = pc_scores)
# ANOVA of β-diversity with canker count
beta_canker_anova <- function(pc, data) {
f = paste0(canker_design, "+", pc)
m = glm.nb(f, data = data)
a = anova(base_model, m) %>% data.frame()
b = anova(m) %>% data.frame()
total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
d = data.frame(
PC = pc,
coef = m$coefficients[pc],
var = b[pc, 'Deviance'] / total_deviance * 100,
p = a[2, 'Pr.Chi.']
)
return(d)
}
# Effect of β-diversity on canker count for each PC
beta_canker_results <- data.table(t(sapply(pcs, function(x) beta_canker_anova(x, pc_scores))))
beta_canker_results
# PC coef var p
# 1: PC1 -0.02475561 0.2245606 0.0001345953
# 2: PC2 -0.007178287 0.02763263 0.3213615
# 3: PC3 -0.003666729 0.3145477 0.4859172
# 4: PC4 0.01570856 0.2983212 0.009036866
abundance_combined <- rbind(
as.data.frame(colData(ubiome_FUN$dds)), as.data.frame(colData(ubiome_BAC$dds))
) %>% mutate(kingdom = ifelse(Target == "ITS", "Fungi", "Bacteria"))
abundance_bar <- ggbarplot(
data = abundance_combined, x = "Season", y = "log_copy_number",
fill = "Site", add = "mean_se", facet.by = "kingdom",
palette = cbPalette, position = position_dodge(0.8),
ylab = "Mean copy number (log10)", xlab = F, legend = "right"
) + guides(fill = guide_legend(title = "Site"))
ggsave(
filename = "abundance_bar.png", plot = abundance_bar, path = "figures/",
height = 12, width = 24, units = "cm"
)
# abundance_bar
abundance_box <- ggboxplot(
data = abundance_combined, x = "Site", y = "log_copy_number",
color = "Season", add = "jitter", facet.by = "kingdom",
palette = cbPalette, ylab = "Mean copy number (log10)", xlab = "Site"
) + guides(color = guide_legend(position = "right"))
ggsave(
filename = "abundance_box.png", plot = abundance_box, path = "figures/",
height = 12, width = 24, units = "cm"
)
abundance_box
alpha_combined <- rbind(fun_alpha, bac_alpha) %>%
subset(select = c("Site", "Season", "Scion", "Target", "shannon", "simpson")) %>%
mutate(kingdom = ifelse(Target == "ITS", "Fungi", "Bacteria")) %>%
pivot_longer(
cols = c("shannon", "simpson"), names_to = "measure", values_to = "value"
)
alpha_bar <- ggbarplot(
data = alpha_combined, x = "Season", y = "value",
fill = "Site", add = "mean_se", facet.by = c("measure", "kingdom"),
palette = cbPalette, position = position_dodge(0.8), scales = "free_y",
ylab = "Mean diversity index", xlab = F, legend = "right"
) + guides(fill = guide_legend(title = "Site"))
ggsave(
filename = "alpha_bar.png", plot = alpha_bar, path = "figures/",
height = 12, width = 24, units = "cm"
)
alpha_bar
alpha_box <- ggboxplot(
data = alpha_combined, x = "Site", y = "value",
color = "Season", add = "jitter", facet.by = c("measure", "kingdom"),
palette = cbPalette, scales = "free_y",
ylab = "Mean diversity index", xlab = "Site"
) + guides(color = guide_legend(position = "right"))
ggsave(
filename = "alpha_box.png", plot = alpha_box, path = "figures/",
height = 12, width = 24, units = "cm"
)
alpha_box
pca_combo_plot <- ggarrange(
fun_pca_plot, bac_pca_plot,
ncol = 2, nrow = 1, labels = c("A", "B"),
common.legend = T, legend = "right"
)
ggsave(
filename = "pca_combo_plot.png", plot = pca_combo_plot, path = "figures/",
height = 18, width = 25, units = "cm"
)
pca_combo_plot
nmds_combo_plot <- ggarrange(
fun_nmds_plot, bac_nmds_plot,
ncol = 2, nrow = 1, labels = c("A", "B"),
common.legend = T, legend = "right"
)
ggsave(
filename = "nmds_combo_plot.png", plot = nmds_combo_plot, path = "figures/",
height = 18, width = 25, units = "cm"
)
nmds_combo_plot
mega_combo_plot <- ggarrange(
fun_pca_plot, bac_pca_plot,
fun_nmds_plot, bac_nmds_plot,
ncol = 2, nrow = 2, labels = c("A", "B", "C", "D"),
common.legend = T, legend = "bottom"
) + labs(shape = "Planting season")
ggsave(
filename = "mega_combo_plot.png", plot = mega_combo_plot, path = "figures/",
height = 25, width = 30, units = "cm"
)
mega_combo_plot
# Save environment
save.image("BAC.RData")